{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 7416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.484304932735426e-10, "logits/chosen": -2.0979156494140625, "logits/rejected": -2.3109986782073975, "logps/chosen": -10.958471298217773, "logps/rejected": -10.488727569580078, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 8.968609865470852e-10, "logits/chosen": -2.171200752258301, "logits/rejected": -2.172977924346924, "logps/chosen": -11.140979766845703, "logps/rejected": -8.594125747680664, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "learning_rate": 1.3452914798206277e-09, "logits/chosen": -2.2804348468780518, "logits/rejected": -2.1795849800109863, "logps/chosen": -42.27271270751953, "logps/rejected": -9.860451698303223, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": -0.009267807006835938, "rewards/margins": -0.0029623028822243214, "rewards/rejected": -0.006305504124611616, "step": 3 }, { "epoch": 0.0, "learning_rate": 1.7937219730941704e-09, "logits/chosen": -2.1906144618988037, "logits/rejected": -2.193382501602173, "logps/chosen": -18.959030151367188, "logps/rejected": -9.666114807128906, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.005116653628647327, "rewards/margins": 0.008443832397460938, "rewards/rejected": -0.0033271790016442537, "step": 4 }, { "epoch": 0.0, "learning_rate": 2.242152466367713e-09, "logits/chosen": -2.1522347927093506, "logits/rejected": -2.1507320404052734, "logps/chosen": -13.525835037231445, "logps/rejected": -11.254899978637695, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.010642051696777344, "rewards/margins": 0.007407951168715954, "rewards/rejected": 0.0032341002952307463, "step": 5 }, { "epoch": 0.0, "learning_rate": 2.6905829596412555e-09, "logits/chosen": -2.2679102420806885, "logits/rejected": -2.4126927852630615, "logps/chosen": -8.25623893737793, "logps/rejected": -7.917046546936035, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.0029366493690758944, "rewards/margins": 0.007938670925796032, "rewards/rejected": -0.005002021789550781, "step": 6 }, { "epoch": 0.0, "learning_rate": 3.139013452914798e-09, "logits/chosen": -2.122077226638794, "logits/rejected": -2.116199254989624, "logps/chosen": -20.23251724243164, "logps/rejected": -10.993452072143555, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": -0.004834366030991077, "rewards/margins": -0.002936077304184437, "rewards/rejected": -0.0018982887268066406, "step": 7 }, { "epoch": 0.0, "learning_rate": 3.5874439461883408e-09, "logits/chosen": -2.10577654838562, "logits/rejected": -2.3481462001800537, "logps/chosen": -9.51355266571045, "logps/rejected": -9.381867408752441, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.012026405893266201, "rewards/margins": 0.009794998914003372, "rewards/rejected": 0.0022314072120934725, "step": 8 }, { "epoch": 0.0, "learning_rate": 4.035874439461883e-09, "logits/chosen": -2.046794891357422, "logits/rejected": -2.284972667694092, "logps/chosen": -9.697582244873047, "logps/rejected": -9.77219009399414, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 4.0054324017546605e-06, "rewards/margins": -0.0017076493240892887, "rewards/rejected": 0.0017116547096520662, "step": 9 }, { "epoch": 0.01, "learning_rate": 4.484304932735426e-09, "logits/chosen": -2.1372909545898438, "logits/rejected": -2.2811264991760254, "logps/chosen": -7.20483922958374, "logps/rejected": -7.161868572235107, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.0063742161728441715, "rewards/margins": -0.001071167178452015, "rewards/rejected": 0.0074453833512961864, "step": 10 }, { "epoch": 0.01, "learning_rate": 4.932735426008968e-09, "logits/chosen": -2.2305495738983154, "logits/rejected": -2.236501693725586, "logps/chosen": -10.80730152130127, "logps/rejected": -7.724041938781738, "loss": 0.7, "rewards/accuracies": 0.0, "rewards/chosen": -0.015626240521669388, "rewards/margins": -0.01370153483003378, "rewards/rejected": -0.0019247055752202868, "step": 11 }, { "epoch": 0.01, "learning_rate": 5.381165919282511e-09, "logits/chosen": -2.1352505683898926, "logits/rejected": -2.207152843475342, "logps/chosen": -13.904718399047852, "logps/rejected": -26.961816787719727, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.007699108216911554, "rewards/margins": 0.013259220868349075, "rewards/rejected": -0.005560112185776234, "step": 12 }, { "epoch": 0.01, "learning_rate": 5.829596412556054e-09, "logits/chosen": -2.0938332080841064, "logits/rejected": -2.0994534492492676, "logps/chosen": -11.446889877319336, "logps/rejected": -8.46900463104248, "loss": 0.6996, "rewards/accuracies": 0.0, "rewards/chosen": -0.008844757452607155, "rewards/margins": -0.012906361371278763, "rewards/rejected": 0.004061603453010321, "step": 13 }, { "epoch": 0.01, "learning_rate": 6.278026905829596e-09, "logits/chosen": -2.1959187984466553, "logits/rejected": -2.1970467567443848, "logps/chosen": -13.465719223022461, "logps/rejected": -8.570552825927734, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": -0.004717445466667414, "rewards/margins": -0.005253314971923828, "rewards/rejected": 0.0005358696216717362, "step": 14 }, { "epoch": 0.01, "learning_rate": 6.726457399103139e-09, "logits/chosen": -2.1478323936462402, "logits/rejected": -2.1482763290405273, "logps/chosen": -18.833969116210938, "logps/rejected": -8.214381217956543, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.0017202377784997225, "rewards/margins": 0.00811223965138197, "rewards/rejected": -0.006392002105712891, "step": 15 }, { "epoch": 0.01, "learning_rate": 7.1748878923766815e-09, "logits/chosen": -2.122292995452881, "logits/rejected": -2.299795150756836, "logps/chosen": -9.416583061218262, "logps/rejected": -9.142372131347656, "loss": 0.7032, "rewards/accuracies": 0.0, "rewards/chosen": -0.009150790981948376, "rewards/margins": -0.020065974444150925, "rewards/rejected": 0.010915184393525124, "step": 16 }, { "epoch": 0.01, "learning_rate": 7.623318385650223e-09, "logits/chosen": -2.282637357711792, "logits/rejected": -2.337643623352051, "logps/chosen": -19.984046936035156, "logps/rejected": -32.187965393066406, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": -0.00324592599645257, "rewards/margins": -0.008197784423828125, "rewards/rejected": 0.004951858427375555, "step": 17 }, { "epoch": 0.01, "learning_rate": 8.071748878923766e-09, "logits/chosen": -2.0584051609039307, "logits/rejected": -2.061659097671509, "logps/chosen": -11.627410888671875, "logps/rejected": -12.29572868347168, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.0069335936568677425, "rewards/margins": 0.006900596432387829, "rewards/rejected": 3.2997133530443534e-05, "step": 18 }, { "epoch": 0.01, "learning_rate": 8.520179372197309e-09, "logits/chosen": -2.11080265045166, "logits/rejected": -2.107851982116699, "logps/chosen": -20.36676788330078, "logps/rejected": -10.951178550720215, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": -0.013389969244599342, "rewards/margins": -0.011485195718705654, "rewards/rejected": -0.0019047737587243319, "step": 19 }, { "epoch": 0.01, "learning_rate": 8.968609865470851e-09, "logits/chosen": -2.179429292678833, "logits/rejected": -2.1816587448120117, "logps/chosen": -11.602340698242188, "logps/rejected": -10.306724548339844, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": -0.0020270347595214844, "rewards/margins": -0.003483295440673828, "rewards/rejected": 0.0014562606811523438, "step": 20 }, { "epoch": 0.01, "learning_rate": 9.417040358744394e-09, "logits/chosen": -2.062178134918213, "logits/rejected": -2.056760311126709, "logps/chosen": -48.01906967163086, "logps/rejected": -13.899154663085938, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.01392974890768528, "rewards/margins": 0.005873489193618298, "rewards/rejected": 0.008056259714066982, "step": 21 }, { "epoch": 0.01, "learning_rate": 9.865470852017937e-09, "logits/chosen": -2.1647462844848633, "logits/rejected": -2.175149440765381, "logps/chosen": -16.655590057373047, "logps/rejected": -10.006885528564453, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.0037675858475267887, "rewards/margins": 0.008612537756562233, "rewards/rejected": -0.004844951909035444, "step": 22 }, { "epoch": 0.01, "learning_rate": 1.031390134529148e-08, "logits/chosen": -2.168323516845703, "logits/rejected": -2.0924549102783203, "logps/chosen": -31.409948348999023, "logps/rejected": -9.706562042236328, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": -0.005486488342285156, "rewards/margins": -0.009876346215605736, "rewards/rejected": 0.004389858338981867, "step": 23 }, { "epoch": 0.01, "learning_rate": 1.0762331838565022e-08, "logits/chosen": -2.055537700653076, "logits/rejected": -2.330641508102417, "logps/chosen": -8.751344680786133, "logps/rejected": -8.589177131652832, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.007161140441894531, "rewards/margins": 0.007083034608513117, "rewards/rejected": 7.81059279688634e-05, "step": 24 }, { "epoch": 0.01, "learning_rate": 1.1210762331838565e-08, "logits/chosen": -2.1600911617279053, "logits/rejected": -2.1584370136260986, "logps/chosen": -15.659576416015625, "logps/rejected": -9.455354690551758, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": -0.00469131488353014, "rewards/margins": 0.0014216420240700245, "rewards/rejected": -0.006112956907600164, "step": 25 }, { "epoch": 0.01, "learning_rate": 1.1659192825112107e-08, "logits/chosen": -2.1902377605438232, "logits/rejected": -2.190551519393921, "logps/chosen": -20.601390838623047, "logps/rejected": -18.08754539489746, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.010497665964066982, "rewards/margins": 0.018246270716190338, "rewards/rejected": -0.007748603820800781, "step": 26 }, { "epoch": 0.01, "learning_rate": 1.210762331838565e-08, "logits/chosen": -2.084904670715332, "logits/rejected": -2.093292236328125, "logps/chosen": -18.993885040283203, "logps/rejected": -9.99695110321045, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": -0.004439926240593195, "rewards/margins": -0.002069187117740512, "rewards/rejected": -0.002370739122852683, "step": 27 }, { "epoch": 0.02, "learning_rate": 1.2556053811659192e-08, "logits/chosen": -2.1882991790771484, "logits/rejected": -2.3575279712677, "logps/chosen": -20.142322540283203, "logps/rejected": -14.525827407836914, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": -0.010116005316376686, "rewards/margins": -0.003262138459831476, "rewards/rejected": -0.00685386685654521, "step": 28 }, { "epoch": 0.02, "learning_rate": 1.3004484304932733e-08, "logits/chosen": -2.0094985961914062, "logits/rejected": -2.2922613620758057, "logps/chosen": -10.89178466796875, "logps/rejected": -10.391555786132812, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.00951776560395956, "rewards/margins": 0.009990311227738857, "rewards/rejected": -0.0004725456237792969, "step": 29 }, { "epoch": 0.02, "learning_rate": 1.3452914798206278e-08, "logits/chosen": -2.0719823837280273, "logits/rejected": -2.3251965045928955, "logps/chosen": -9.654312133789062, "logps/rejected": -9.270055770874023, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": -0.010032844729721546, "rewards/margins": -0.007944202050566673, "rewards/rejected": -0.0020886422134935856, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.3901345291479822e-08, "logits/chosen": -2.082319498062134, "logits/rejected": -2.3233470916748047, "logps/chosen": -9.180187225341797, "logps/rejected": -9.012532234191895, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": -0.0019676208030432463, "rewards/margins": -0.004655265714973211, "rewards/rejected": 0.002687644911929965, "step": 31 }, { "epoch": 0.02, "learning_rate": 1.4349775784753363e-08, "logits/chosen": -2.0313689708709717, "logits/rejected": -2.0330817699432373, "logps/chosen": -12.790792465209961, "logps/rejected": -11.82142448425293, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": -0.000301170366583392, "rewards/margins": 0.006512832827866077, "rewards/rejected": -0.006814002990722656, "step": 32 }, { "epoch": 0.02, "learning_rate": 1.4798206278026906e-08, "logits/chosen": -2.217749834060669, "logits/rejected": -2.1237375736236572, "logps/chosen": -47.82718276977539, "logps/rejected": -9.560040473937988, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.013452529907226562, "rewards/margins": 0.018329525366425514, "rewards/rejected": -0.004876994993537664, "step": 33 }, { "epoch": 0.02, "learning_rate": 1.5246636771300447e-08, "logits/chosen": -2.0327343940734863, "logits/rejected": -2.311234951019287, "logps/chosen": -9.408660888671875, "logps/rejected": -9.276195526123047, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.001929473946802318, "rewards/margins": -0.0005926132434979081, "rewards/rejected": 0.002522087190300226, "step": 34 }, { "epoch": 0.02, "learning_rate": 1.569506726457399e-08, "logits/chosen": -2.019519805908203, "logits/rejected": -2.2516605854034424, "logps/chosen": -9.213037490844727, "logps/rejected": -8.878981590270996, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": -0.0011004448169842362, "rewards/margins": -0.0034648897126317024, "rewards/rejected": 0.0023644447792321444, "step": 35 }, { "epoch": 0.02, "learning_rate": 1.6143497757847532e-08, "logits/chosen": -2.02312970161438, "logits/rejected": -2.3072831630706787, "logps/chosen": -8.753149032592773, "logps/rejected": -8.37270450592041, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012904166942462325, "rewards/margins": 0.0071578980423510075, "rewards/rejected": -0.005867481231689453, "step": 36 }, { "epoch": 0.02, "learning_rate": 1.6591928251121076e-08, "logits/chosen": -2.0829267501831055, "logits/rejected": -2.0858778953552246, "logps/chosen": -16.962913513183594, "logps/rejected": -8.138038635253906, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.0025054931174963713, "rewards/margins": 0.002745246747508645, "rewards/rejected": -0.0002397537318756804, "step": 37 }, { "epoch": 0.02, "learning_rate": 1.7040358744394617e-08, "logits/chosen": -2.0977163314819336, "logits/rejected": -2.002244472503662, "logps/chosen": -45.645721435546875, "logps/rejected": -11.025367736816406, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.023189162835478783, "rewards/margins": 0.005823515355587006, "rewards/rejected": 0.017365647479891777, "step": 38 }, { "epoch": 0.02, "learning_rate": 1.748878923766816e-08, "logits/chosen": -2.164247751235962, "logits/rejected": -2.1675541400909424, "logps/chosen": -16.812410354614258, "logps/rejected": -13.18349838256836, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.012810135260224342, "rewards/margins": 0.014805984683334827, "rewards/rejected": -0.0019958496559411287, "step": 39 }, { "epoch": 0.02, "learning_rate": 1.7937219730941703e-08, "logits/chosen": -2.1965765953063965, "logits/rejected": -2.322868824005127, "logps/chosen": -14.265069961547852, "logps/rejected": -15.29957389831543, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.0007570266607217491, "rewards/margins": -0.010735702700912952, "rewards/rejected": 0.011492729187011719, "step": 40 }, { "epoch": 0.02, "learning_rate": 1.8385650224215247e-08, "logits/chosen": -2.1731631755828857, "logits/rejected": -2.183107852935791, "logps/chosen": -15.3982515335083, "logps/rejected": -21.407642364501953, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": -3.7860871088923886e-05, "rewards/margins": -0.0033284188248217106, "rewards/rejected": 0.003290558001026511, "step": 41 }, { "epoch": 0.02, "learning_rate": 1.8834080717488788e-08, "logits/chosen": -2.1261308193206787, "logits/rejected": -2.371284246444702, "logps/chosen": -11.46033763885498, "logps/rejected": -15.267311096191406, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": -0.004968738649040461, "rewards/margins": -0.021791839972138405, "rewards/rejected": 0.01682310178875923, "step": 42 }, { "epoch": 0.02, "learning_rate": 1.9282511210762332e-08, "logits/chosen": -2.191890239715576, "logits/rejected": -2.1920430660247803, "logps/chosen": -17.172622680664062, "logps/rejected": -10.345775604248047, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": -0.011552619747817516, "rewards/margins": -0.014843463897705078, "rewards/rejected": 0.003290843917056918, "step": 43 }, { "epoch": 0.02, "learning_rate": 1.9730941704035873e-08, "logits/chosen": -2.20365047454834, "logits/rejected": -2.2390387058258057, "logps/chosen": -25.083232879638672, "logps/rejected": -19.33491325378418, "loss": 0.7031, "rewards/accuracies": 0.0, "rewards/chosen": -0.0016122817760333419, "rewards/margins": -0.01974334754049778, "rewards/rejected": 0.018131066113710403, "step": 44 }, { "epoch": 0.02, "learning_rate": 2.0179372197309417e-08, "logits/chosen": -2.1107048988342285, "logits/rejected": -2.0678274631500244, "logps/chosen": -31.580238342285156, "logps/rejected": -10.914543151855469, "loss": 0.6987, "rewards/accuracies": 0.0, "rewards/chosen": -0.009390640072524548, "rewards/margins": -0.01107864361256361, "rewards/rejected": 0.0016880035400390625, "step": 45 }, { "epoch": 0.02, "learning_rate": 2.062780269058296e-08, "logits/chosen": -2.013056516647339, "logits/rejected": -2.0326292514801025, "logps/chosen": -10.121976852416992, "logps/rejected": -22.88250732421875, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": -0.005608177278190851, "rewards/margins": -0.011488151736557484, "rewards/rejected": 0.0058799744583666325, "step": 46 }, { "epoch": 0.03, "learning_rate": 2.10762331838565e-08, "logits/chosen": -2.204366683959961, "logits/rejected": -2.347001314163208, "logps/chosen": -13.06828498840332, "logps/rejected": -13.000082015991211, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015986443031579256, "rewards/margins": 0.009407139383256435, "rewards/rejected": -0.011005783453583717, "step": 47 }, { "epoch": 0.03, "learning_rate": 2.1524663677130044e-08, "logits/chosen": -2.0366978645324707, "logits/rejected": -2.032083511352539, "logps/chosen": -11.414079666137695, "logps/rejected": -9.905214309692383, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.004105949308723211, "rewards/margins": -0.00221862830221653, "rewards/rejected": 0.006324577610939741, "step": 48 }, { "epoch": 0.03, "learning_rate": 2.1973094170403585e-08, "logits/chosen": -2.226799964904785, "logits/rejected": -2.222607135772705, "logps/chosen": -20.071346282958984, "logps/rejected": -9.324289321899414, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": -0.01372604351490736, "rewards/margins": -0.009464740753173828, "rewards/rejected": -0.004261303227394819, "step": 49 }, { "epoch": 0.03, "learning_rate": 2.242152466367713e-08, "logits/chosen": -2.036208152770996, "logits/rejected": -2.303960084915161, "logps/chosen": -10.230937957763672, "logps/rejected": -10.10129165649414, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.0061286925338208675, "rewards/margins": 0.0055678365752100945, "rewards/rejected": 0.0005608559004031122, "step": 50 }, { "epoch": 0.03, "learning_rate": 2.286995515695067e-08, "logits/chosen": -2.0005571842193604, "logits/rejected": -2.0015511512756348, "logps/chosen": -10.7401123046875, "logps/rejected": -8.49370288848877, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": -0.008684731088578701, "rewards/margins": 0.00394744798541069, "rewards/rejected": -0.012632179073989391, "step": 51 }, { "epoch": 0.03, "learning_rate": 2.3318385650224214e-08, "logits/chosen": -2.1137096881866455, "logits/rejected": -2.3734209537506104, "logps/chosen": -11.088902473449707, "logps/rejected": -11.069307327270508, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": -0.0016759872669354081, "rewards/margins": -0.0047893524169921875, "rewards/rejected": 0.003113365266472101, "step": 52 }, { "epoch": 0.03, "learning_rate": 2.3766816143497755e-08, "logits/chosen": -2.0832955837249756, "logits/rejected": -2.33752179145813, "logps/chosen": -11.344663619995117, "logps/rejected": -10.899211883544922, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.005830955691635609, "rewards/margins": 0.004732227418571711, "rewards/rejected": 0.0010987281566485763, "step": 53 }, { "epoch": 0.03, "learning_rate": 2.42152466367713e-08, "logits/chosen": -2.128887176513672, "logits/rejected": -2.117245674133301, "logps/chosen": -27.3739013671875, "logps/rejected": -9.35929012298584, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": -0.009698867797851562, "rewards/margins": -0.001710318960249424, "rewards/rejected": -0.007988548837602139, "step": 54 }, { "epoch": 0.03, "learning_rate": 2.466367713004484e-08, "logits/chosen": -1.9909394979476929, "logits/rejected": -2.2808797359466553, "logps/chosen": -10.750165939331055, "logps/rejected": -10.1879301071167, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.0011767387622967362, "rewards/margins": -0.001983833499252796, "rewards/rejected": 0.0031605721451342106, "step": 55 }, { "epoch": 0.03, "learning_rate": 2.5112107623318385e-08, "logits/chosen": -2.061950922012329, "logits/rejected": -2.0567033290863037, "logps/chosen": -17.188932418823242, "logps/rejected": -8.683694839477539, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.006194496061652899, "rewards/margins": 0.003715324215590954, "rewards/rejected": 0.002479171846061945, "step": 56 }, { "epoch": 0.03, "learning_rate": 2.556053811659193e-08, "logits/chosen": -2.0040969848632812, "logits/rejected": -2.360913038253784, "logps/chosen": -9.527205467224121, "logps/rejected": -22.24579620361328, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": -0.00027570725069381297, "rewards/margins": 0.009847927838563919, "rewards/rejected": -0.010123634710907936, "step": 57 }, { "epoch": 0.03, "learning_rate": 2.6008968609865467e-08, "logits/chosen": -2.2485427856445312, "logits/rejected": -2.3744025230407715, "logps/chosen": -13.319928169250488, "logps/rejected": -12.437515258789062, "loss": 0.6963, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018349647289142013, "rewards/margins": -0.006390094757080078, "rewards/rejected": 0.004555129911750555, "step": 58 }, { "epoch": 0.03, "learning_rate": 2.645739910313901e-08, "logits/chosen": -2.151921510696411, "logits/rejected": -2.314770221710205, "logps/chosen": -9.859323501586914, "logps/rejected": -9.644436836242676, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": -0.0034657479263842106, "rewards/margins": -0.0017733575077727437, "rewards/rejected": -0.0016923904186114669, "step": 59 }, { "epoch": 0.03, "learning_rate": 2.6905829596412556e-08, "logits/chosen": -2.069385051727295, "logits/rejected": -2.0765883922576904, "logps/chosen": -11.373144149780273, "logps/rejected": -10.947542190551758, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.006391429807990789, "rewards/margins": -0.0008979798294603825, "rewards/rejected": 0.007289409637451172, "step": 60 }, { "epoch": 0.03, "learning_rate": 2.73542600896861e-08, "logits/chosen": -2.074838638305664, "logits/rejected": -2.2899177074432373, "logps/chosen": -9.630131721496582, "logps/rejected": -9.597940444946289, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.007001018617302179, "rewards/margins": 0.0005183219909667969, "rewards/rejected": 0.0064826966263353825, "step": 61 }, { "epoch": 0.03, "learning_rate": 2.7802690582959644e-08, "logits/chosen": -2.12103271484375, "logits/rejected": -2.142998218536377, "logps/chosen": -14.921041488647461, "logps/rejected": -13.013690948486328, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.002244377275928855, "rewards/margins": 0.011725998483598232, "rewards/rejected": -0.009481620974838734, "step": 62 }, { "epoch": 0.03, "learning_rate": 2.8251121076233182e-08, "logits/chosen": -2.0864696502685547, "logits/rejected": -2.240133047103882, "logps/chosen": -8.901256561279297, "logps/rejected": -8.808066368103027, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.014261818490922451, "rewards/margins": 0.008964348584413528, "rewards/rejected": 0.00529747037217021, "step": 63 }, { "epoch": 0.03, "learning_rate": 2.8699551569506726e-08, "logits/chosen": -2.2118148803710938, "logits/rejected": -2.238858222961426, "logps/chosen": -33.84086608886719, "logps/rejected": -14.047714233398438, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": -0.0034194947220385075, "rewards/margins": -0.0005533217918127775, "rewards/rejected": -0.00286617293022573, "step": 64 }, { "epoch": 0.04, "learning_rate": 2.9147982062780267e-08, "logits/chosen": -2.1135852336883545, "logits/rejected": -2.115546464920044, "logps/chosen": -12.67255973815918, "logps/rejected": -11.29073715209961, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.000591278076171875, "rewards/margins": -0.0006863594753667712, "rewards/rejected": 0.0012776375515386462, "step": 65 }, { "epoch": 0.04, "learning_rate": 2.959641255605381e-08, "logits/chosen": -2.006624460220337, "logits/rejected": -2.2874600887298584, "logps/chosen": -9.63341236114502, "logps/rejected": -9.378401756286621, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.007855415344238281, "rewards/margins": 0.0070037841796875, "rewards/rejected": 0.0008516311645507812, "step": 66 }, { "epoch": 0.04, "learning_rate": 3.004484304932735e-08, "logits/chosen": -2.2065813541412354, "logits/rejected": -2.186397075653076, "logps/chosen": -26.787425994873047, "logps/rejected": -13.201791763305664, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.0022727965842932463, "rewards/margins": -0.0013027191162109375, "rewards/rejected": 0.0035755157005041838, "step": 67 }, { "epoch": 0.04, "learning_rate": 3.0493273542600893e-08, "logits/chosen": -2.1411290168762207, "logits/rejected": -2.138977289199829, "logps/chosen": -21.950511932373047, "logps/rejected": -8.461540222167969, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.0064178467728197575, "rewards/margins": 0.00044794101268053055, "rewards/rejected": 0.005969905760139227, "step": 68 }, { "epoch": 0.04, "learning_rate": 3.094170403587444e-08, "logits/chosen": -2.031545877456665, "logits/rejected": -2.0380606651306152, "logps/chosen": -15.374809265136719, "logps/rejected": -9.087368965148926, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": -0.0022771835792809725, "rewards/margins": -0.007164192385971546, "rewards/rejected": 0.00488700857385993, "step": 69 }, { "epoch": 0.04, "learning_rate": 3.139013452914798e-08, "logits/chosen": -2.2778728008270264, "logits/rejected": -2.2770164012908936, "logps/chosen": -12.302946090698242, "logps/rejected": -9.111112594604492, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": -0.007271003909409046, "rewards/margins": -0.01146545447409153, "rewards/rejected": 0.004194450564682484, "step": 70 }, { "epoch": 0.04, "learning_rate": 3.183856502242152e-08, "logits/chosen": -2.148139238357544, "logits/rejected": -2.150254249572754, "logps/chosen": -10.213472366333008, "logps/rejected": -9.785621643066406, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.0011060715187340975, "rewards/margins": -0.013888549990952015, "rewards/rejected": 0.014994621276855469, "step": 71 }, { "epoch": 0.04, "learning_rate": 3.2286995515695064e-08, "logits/chosen": -2.1833627223968506, "logits/rejected": -2.3227546215057373, "logps/chosen": -8.625642776489258, "logps/rejected": -8.512284278869629, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.008752060122787952, "rewards/margins": 0.008170604705810547, "rewards/rejected": 0.0005814552423544228, "step": 72 }, { "epoch": 0.04, "learning_rate": 3.273542600896861e-08, "logits/chosen": -2.0864365100860596, "logits/rejected": -2.0781664848327637, "logps/chosen": -19.97391128540039, "logps/rejected": -10.473222732543945, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.013383865356445312, "rewards/margins": 0.0044394489377737045, "rewards/rejected": 0.008944416418671608, "step": 73 }, { "epoch": 0.04, "learning_rate": 3.318385650224215e-08, "logits/chosen": -2.1393914222717285, "logits/rejected": -2.1412086486816406, "logps/chosen": -10.679883003234863, "logps/rejected": -9.8320951461792, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 0.003774356795474887, "rewards/margins": -0.0015248300042003393, "rewards/rejected": 0.005299186799675226, "step": 74 }, { "epoch": 0.04, "learning_rate": 3.36322869955157e-08, "logits/chosen": -2.0669848918914795, "logits/rejected": -2.0619335174560547, "logps/chosen": -15.047464370727539, "logps/rejected": -9.684029579162598, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.001528930733911693, "rewards/margins": 0.0038137435913085938, "rewards/rejected": -0.0022848129738122225, "step": 75 }, { "epoch": 0.04, "learning_rate": 3.4080717488789235e-08, "logits/chosen": -2.0803909301757812, "logits/rejected": -2.0901777744293213, "logps/chosen": -14.437600135803223, "logps/rejected": -9.265451431274414, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.023670673370361328, "rewards/margins": 0.02168264426290989, "rewards/rejected": 0.001988029573112726, "step": 76 }, { "epoch": 0.04, "learning_rate": 3.452914798206278e-08, "logits/chosen": -2.080399513244629, "logits/rejected": -2.076751708984375, "logps/chosen": -23.027435302734375, "logps/rejected": -9.081665992736816, "loss": 0.7014, "rewards/accuracies": 0.0, "rewards/chosen": 0.007004547398537397, "rewards/margins": -0.016385937109589577, "rewards/rejected": 0.02339048497378826, "step": 77 }, { "epoch": 0.04, "learning_rate": 3.497757847533632e-08, "logits/chosen": -2.139862298965454, "logits/rejected": -2.144566059112549, "logps/chosen": -23.107257843017578, "logps/rejected": -8.340372085571289, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.019701004028320312, "rewards/margins": 0.014268780127167702, "rewards/rejected": 0.005432224366813898, "step": 78 }, { "epoch": 0.04, "learning_rate": 3.542600896860987e-08, "logits/chosen": -2.118016004562378, "logits/rejected": -2.1104118824005127, "logps/chosen": -20.992252349853516, "logps/rejected": -12.684316635131836, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.0036623000632971525, "rewards/margins": 0.0011895177885890007, "rewards/rejected": 0.002472782274708152, "step": 79 }, { "epoch": 0.04, "learning_rate": 3.5874439461883405e-08, "logits/chosen": -2.225069522857666, "logits/rejected": -2.2532010078430176, "logps/chosen": -20.482357025146484, "logps/rejected": -20.78835105895996, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": 0.019568635150790215, "rewards/margins": 0.041452981531620026, "rewards/rejected": -0.02188434638082981, "step": 80 }, { "epoch": 0.04, "learning_rate": 3.632286995515695e-08, "logits/chosen": -2.0844480991363525, "logits/rejected": -2.079559803009033, "logps/chosen": -12.018669128417969, "logps/rejected": -8.865294456481934, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.01151886023581028, "rewards/margins": 0.0017621992155909538, "rewards/rejected": 0.009756661020219326, "step": 81 }, { "epoch": 0.04, "learning_rate": 3.6771300448430494e-08, "logits/chosen": -2.1681113243103027, "logits/rejected": -2.172297239303589, "logps/chosen": -15.33029842376709, "logps/rejected": -8.158651351928711, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.013186645694077015, "rewards/margins": 0.0076732635498046875, "rewards/rejected": 0.005513382144272327, "step": 82 }, { "epoch": 0.04, "learning_rate": 3.721973094170404e-08, "logits/chosen": -2.061526298522949, "logits/rejected": -2.0643839836120605, "logps/chosen": -10.386487007141113, "logps/rejected": -10.729504585266113, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.007389545440673828, "rewards/margins": 0.006200027652084827, "rewards/rejected": 0.0011895180214196444, "step": 83 }, { "epoch": 0.05, "learning_rate": 3.7668161434977576e-08, "logits/chosen": -2.1326401233673096, "logits/rejected": -2.1434714794158936, "logps/chosen": -18.549726486206055, "logps/rejected": -15.390289306640625, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.00934600830078125, "rewards/margins": -0.004179954528808594, "rewards/rejected": 0.013525962829589844, "step": 84 }, { "epoch": 0.05, "learning_rate": 3.811659192825112e-08, "logits/chosen": -2.021798849105835, "logits/rejected": -2.0187878608703613, "logps/chosen": -23.87000274658203, "logps/rejected": -9.498292922973633, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.007972336374223232, "rewards/margins": -0.0019550323486328125, "rewards/rejected": 0.009927368722856045, "step": 85 }, { "epoch": 0.05, "learning_rate": 3.8565022421524664e-08, "logits/chosen": -2.1725194454193115, "logits/rejected": -2.181325912475586, "logps/chosen": -15.940302848815918, "logps/rejected": -11.908823013305664, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.010583973489701748, "rewards/margins": 4.024524241685867e-05, "rewards/rejected": 0.01054372824728489, "step": 86 }, { "epoch": 0.05, "learning_rate": 3.901345291479821e-08, "logits/chosen": -2.146444320678711, "logits/rejected": -2.294748067855835, "logps/chosen": -12.352265357971191, "logps/rejected": -10.713945388793945, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.007040500640869141, "rewards/margins": 0.0029088021256029606, "rewards/rejected": 0.00413169851526618, "step": 87 }, { "epoch": 0.05, "learning_rate": 3.9461883408071746e-08, "logits/chosen": -2.001359462738037, "logits/rejected": -2.2578251361846924, "logps/chosen": -9.415419578552246, "logps/rejected": -9.54137897491455, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 0.006729793734848499, "rewards/margins": -0.0015927311033010483, "rewards/rejected": 0.008322524838149548, "step": 88 }, { "epoch": 0.05, "learning_rate": 3.991031390134529e-08, "logits/chosen": -2.0692389011383057, "logits/rejected": -2.077415704727173, "logps/chosen": -11.641901016235352, "logps/rejected": -18.54785919189453, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.010214042849838734, "rewards/margins": 0.018882371485233307, "rewards/rejected": -0.008668327704071999, "step": 89 }, { "epoch": 0.05, "learning_rate": 4.0358744394618835e-08, "logits/chosen": -2.1643319129943848, "logits/rejected": -2.1580073833465576, "logps/chosen": -20.824539184570312, "logps/rejected": -10.597345352172852, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.012961006723344326, "rewards/margins": -0.0034938817843794823, "rewards/rejected": 0.01645488850772381, "step": 90 }, { "epoch": 0.05, "learning_rate": 4.080717488789238e-08, "logits/chosen": -2.141144037246704, "logits/rejected": -2.1386516094207764, "logps/chosen": -14.894718170166016, "logps/rejected": -10.045269966125488, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": -7.286071922862902e-05, "rewards/margins": -0.015377235598862171, "rewards/rejected": 0.015304374508559704, "step": 91 }, { "epoch": 0.05, "learning_rate": 4.125560538116592e-08, "logits/chosen": -2.1839067935943604, "logits/rejected": -2.1872270107269287, "logps/chosen": -12.337912559509277, "logps/rejected": -10.837175369262695, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.021632671356201172, "rewards/margins": 0.009337520226836205, "rewards/rejected": 0.012295151129364967, "step": 92 }, { "epoch": 0.05, "learning_rate": 4.170403587443946e-08, "logits/chosen": -2.2160303592681885, "logits/rejected": -2.3270158767700195, "logps/chosen": -21.48740577697754, "logps/rejected": -24.111907958984375, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": -0.007700157351791859, "rewards/margins": 0.0029390333220362663, "rewards/rejected": -0.010639190673828125, "step": 93 }, { "epoch": 0.05, "learning_rate": 4.2152466367713e-08, "logits/chosen": -2.223874568939209, "logits/rejected": -2.2208216190338135, "logps/chosen": -24.745744705200195, "logps/rejected": -8.658109664916992, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.01878795586526394, "rewards/margins": 0.020111465826630592, "rewards/rejected": -0.0013235092628747225, "step": 94 }, { "epoch": 0.05, "learning_rate": 4.260089686098654e-08, "logits/chosen": -2.0548484325408936, "logits/rejected": -2.310408353805542, "logps/chosen": -10.321556091308594, "logps/rejected": -10.276147842407227, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.013272285461425781, "rewards/margins": 0.008576774969696999, "rewards/rejected": 0.00469551095739007, "step": 95 }, { "epoch": 0.05, "learning_rate": 4.304932735426009e-08, "logits/chosen": -2.1139299869537354, "logits/rejected": -2.353086233139038, "logps/chosen": -10.415892601013184, "logps/rejected": -10.733854293823242, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.01840515248477459, "rewards/margins": 0.013757133856415749, "rewards/rejected": 0.004648018162697554, "step": 96 }, { "epoch": 0.05, "learning_rate": 4.3497757847533625e-08, "logits/chosen": -2.033691883087158, "logits/rejected": -2.0337750911712646, "logps/chosen": -14.742208480834961, "logps/rejected": -8.713153839111328, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": -0.0001445770321879536, "rewards/margins": -0.005589103791862726, "rewards/rejected": 0.005444526672363281, "step": 97 }, { "epoch": 0.05, "learning_rate": 4.394618834080717e-08, "logits/chosen": -2.1012351512908936, "logits/rejected": -2.007627487182617, "logps/chosen": -53.42965316772461, "logps/rejected": -9.90536880493164, "loss": 0.7032, "rewards/accuracies": 0.0, "rewards/chosen": -0.010973358526825905, "rewards/margins": -0.01997079886496067, "rewards/rejected": 0.008997440338134766, "step": 98 }, { "epoch": 0.05, "learning_rate": 4.4394618834080714e-08, "logits/chosen": -2.1733200550079346, "logits/rejected": -2.0552053451538086, "logps/chosen": -56.150184631347656, "logps/rejected": -8.956911087036133, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.019803239032626152, "rewards/margins": -0.00043811649084091187, "rewards/rejected": 0.020241355523467064, "step": 99 }, { "epoch": 0.05, "learning_rate": 4.484304932735426e-08, "logits/chosen": -2.1699624061584473, "logits/rejected": -2.364610433578491, "logps/chosen": -10.075794219970703, "logps/rejected": -9.832568168640137, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": -0.0002401351957814768, "rewards/margins": -0.00965414009988308, "rewards/rejected": 0.0094140050932765, "step": 100 }, { "epoch": 0.05, "learning_rate": 4.5291479820627796e-08, "logits/chosen": -2.1663384437561035, "logits/rejected": -2.1600072383880615, "logps/chosen": -24.121322631835938, "logps/rejected": -8.981534957885742, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.043140411376953125, "rewards/margins": 0.029755115509033203, "rewards/rejected": 0.013385295867919922, "step": 101 }, { "epoch": 0.06, "learning_rate": 4.573991031390134e-08, "logits/chosen": -2.1511003971099854, "logits/rejected": -2.144071578979492, "logps/chosen": -31.566492080688477, "logps/rejected": -10.3168363571167, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.009843445383012295, "rewards/margins": 0.007290077395737171, "rewards/rejected": 0.00255336775444448, "step": 102 }, { "epoch": 0.06, "learning_rate": 4.6188340807174884e-08, "logits/chosen": -2.0356526374816895, "logits/rejected": -2.2458300590515137, "logps/chosen": -9.29807186126709, "logps/rejected": -9.306663513183594, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.015076733194291592, "rewards/margins": 0.0007605552673339844, "rewards/rejected": 0.014316177926957607, "step": 103 }, { "epoch": 0.06, "learning_rate": 4.663677130044843e-08, "logits/chosen": -2.1734111309051514, "logits/rejected": -2.2792162895202637, "logps/chosen": -9.930583953857422, "logps/rejected": -9.384129524230957, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.024132346734404564, "rewards/margins": -0.0032509807497262955, "rewards/rejected": 0.02738332748413086, "step": 104 }, { "epoch": 0.06, "learning_rate": 4.708520179372197e-08, "logits/chosen": -2.1704792976379395, "logits/rejected": -2.3056838512420654, "logps/chosen": -9.841588020324707, "logps/rejected": -9.82347297668457, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.0127716064453125, "rewards/margins": 0.003629016689956188, "rewards/rejected": 0.009142589755356312, "step": 105 }, { "epoch": 0.06, "learning_rate": 4.753363228699551e-08, "logits/chosen": -2.1657538414001465, "logits/rejected": -2.1707890033721924, "logps/chosen": -10.974058151245117, "logps/rejected": -8.239066123962402, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.016999339684844017, "rewards/margins": 0.012023829855024815, "rewards/rejected": 0.004975509829819202, "step": 106 }, { "epoch": 0.06, "learning_rate": 4.7982062780269055e-08, "logits/chosen": -2.054334878921509, "logits/rejected": -2.318596839904785, "logps/chosen": -10.845199584960938, "logps/rejected": -10.822538375854492, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 0.011339283548295498, "rewards/margins": -0.01534814853221178, "rewards/rejected": 0.02668743208050728, "step": 107 }, { "epoch": 0.06, "learning_rate": 4.84304932735426e-08, "logits/chosen": -2.217080593109131, "logits/rejected": -2.143556594848633, "logps/chosen": -52.441341400146484, "logps/rejected": -8.858478546142578, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.03502006456255913, "rewards/margins": 0.027597617357969284, "rewards/rejected": 0.007422447204589844, "step": 108 }, { "epoch": 0.06, "learning_rate": 4.8878923766816144e-08, "logits/chosen": -2.1846086978912354, "logits/rejected": -2.1770896911621094, "logps/chosen": -21.62548065185547, "logps/rejected": -8.9298095703125, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 0.0017780304187908769, "rewards/margins": -0.011945819482207298, "rewards/rejected": 0.01372385025024414, "step": 109 }, { "epoch": 0.06, "learning_rate": 4.932735426008968e-08, "logits/chosen": -2.1917223930358887, "logits/rejected": -2.1849918365478516, "logps/chosen": -13.728654861450195, "logps/rejected": -11.999414443969727, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.017127132043242455, "rewards/margins": -0.004368115216493607, "rewards/rejected": 0.02149524725973606, "step": 110 }, { "epoch": 0.06, "learning_rate": 4.9775784753363226e-08, "logits/chosen": -2.2624897956848145, "logits/rejected": -2.2217416763305664, "logps/chosen": -44.67522430419922, "logps/rejected": -11.847160339355469, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.020919037982821465, "rewards/margins": 0.006544590927660465, "rewards/rejected": 0.014374447055161, "step": 111 }, { "epoch": 0.06, "learning_rate": 5.022421524663677e-08, "logits/chosen": -2.104403018951416, "logits/rejected": -2.1149179935455322, "logps/chosen": -12.03296947479248, "logps/rejected": -9.840739250183105, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.010851669125258923, "rewards/margins": 0.0022958749905228615, "rewards/rejected": 0.008555794134736061, "step": 112 }, { "epoch": 0.06, "learning_rate": 5.0672645739910314e-08, "logits/chosen": -2.1389501094818115, "logits/rejected": -2.2814815044403076, "logps/chosen": -11.094829559326172, "logps/rejected": -11.00583267211914, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": 0.009608650580048561, "rewards/margins": -0.009204482659697533, "rewards/rejected": 0.018813133239746094, "step": 113 }, { "epoch": 0.06, "learning_rate": 5.112107623318386e-08, "logits/chosen": -2.135303497314453, "logits/rejected": -2.330822467803955, "logps/chosen": -14.642257690429688, "logps/rejected": -14.10554313659668, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.024247361347079277, "rewards/margins": 0.011442757211625576, "rewards/rejected": 0.012804604135453701, "step": 114 }, { "epoch": 0.06, "learning_rate": 5.15695067264574e-08, "logits/chosen": -2.2490501403808594, "logits/rejected": -2.1105539798736572, "logps/chosen": -62.304412841796875, "logps/rejected": -8.984655380249023, "loss": 0.6604, "rewards/accuracies": 1.0, "rewards/chosen": 0.07448463886976242, "rewards/margins": 0.0665907934308052, "rewards/rejected": 0.007893848232924938, "step": 115 }, { "epoch": 0.06, "learning_rate": 5.2017937219730934e-08, "logits/chosen": -2.2422280311584473, "logits/rejected": -2.2706260681152344, "logps/chosen": -9.844327926635742, "logps/rejected": -15.087574005126953, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.02075948752462864, "rewards/margins": 0.02462015114724636, "rewards/rejected": -0.0038606643211096525, "step": 116 }, { "epoch": 0.06, "learning_rate": 5.246636771300448e-08, "logits/chosen": -2.076237916946411, "logits/rejected": -2.3247225284576416, "logps/chosen": -16.43852424621582, "logps/rejected": -16.27741241455078, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.03678112104535103, "rewards/margins": 0.019771194085478783, "rewards/rejected": 0.017009926959872246, "step": 117 }, { "epoch": 0.06, "learning_rate": 5.291479820627802e-08, "logits/chosen": -2.156921863555908, "logits/rejected": -2.305867910385132, "logps/chosen": -7.067009925842285, "logps/rejected": -6.753619194030762, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.018022824078798294, "rewards/margins": -0.004119491204619408, "rewards/rejected": 0.0221423152834177, "step": 118 }, { "epoch": 0.06, "learning_rate": 5.336322869955157e-08, "logits/chosen": -2.1750025749206543, "logits/rejected": -2.333986520767212, "logps/chosen": -7.491654872894287, "logps/rejected": -7.278430461883545, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.023765278980135918, "rewards/margins": -0.00045604631304740906, "rewards/rejected": 0.024221325293183327, "step": 119 }, { "epoch": 0.06, "learning_rate": 5.381165919282511e-08, "logits/chosen": -2.0833475589752197, "logits/rejected": -2.3101279735565186, "logps/chosen": -8.644104957580566, "logps/rejected": -8.440065383911133, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": 0.01763925515115261, "rewards/margins": -0.008865071460604668, "rewards/rejected": 0.02650432661175728, "step": 120 }, { "epoch": 0.07, "learning_rate": 5.4260089686098655e-08, "logits/chosen": -2.059966564178467, "logits/rejected": -2.064059257507324, "logps/chosen": -16.735313415527344, "logps/rejected": -12.177474975585938, "loss": 0.7007, "rewards/accuracies": 0.0, "rewards/chosen": 0.012438202276825905, "rewards/margins": -0.014995764940977097, "rewards/rejected": 0.027433967217803, "step": 121 }, { "epoch": 0.07, "learning_rate": 5.47085201793722e-08, "logits/chosen": -2.0977611541748047, "logits/rejected": -2.0955312252044678, "logps/chosen": -12.011667251586914, "logps/rejected": -9.138339042663574, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.035752203315496445, "rewards/margins": 0.0299790408462286, "rewards/rejected": 0.0057731629349291325, "step": 122 }, { "epoch": 0.07, "learning_rate": 5.5156950672645744e-08, "logits/chosen": -2.044938564300537, "logits/rejected": -2.337332248687744, "logps/chosen": -10.960025787353516, "logps/rejected": -10.792793273925781, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.03305549547076225, "rewards/margins": 0.0061788540333509445, "rewards/rejected": 0.02687664143741131, "step": 123 }, { "epoch": 0.07, "learning_rate": 5.560538116591929e-08, "logits/chosen": -2.121669054031372, "logits/rejected": -2.1253552436828613, "logps/chosen": -12.12099552154541, "logps/rejected": -16.071216583251953, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.03081836737692356, "rewards/margins": 0.021753884851932526, "rewards/rejected": 0.00906448345631361, "step": 124 }, { "epoch": 0.07, "learning_rate": 5.605381165919282e-08, "logits/chosen": -2.131110906600952, "logits/rejected": -2.330775499343872, "logps/chosen": -14.662057876586914, "logps/rejected": -9.261777877807617, "loss": 0.6977, "rewards/accuracies": 0.0, "rewards/chosen": 0.02599973790347576, "rewards/margins": -0.009143734350800514, "rewards/rejected": 0.035143472254276276, "step": 125 }, { "epoch": 0.07, "learning_rate": 5.6502242152466364e-08, "logits/chosen": -2.0999538898468018, "logits/rejected": -2.1013946533203125, "logps/chosen": -13.189610481262207, "logps/rejected": -11.380970001220703, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.025351524353027344, "rewards/margins": 0.023193931207060814, "rewards/rejected": 0.002157592913135886, "step": 126 }, { "epoch": 0.07, "learning_rate": 5.695067264573991e-08, "logits/chosen": -2.081967353820801, "logits/rejected": -2.090820074081421, "logps/chosen": -12.203471183776855, "logps/rejected": -7.086677551269531, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.03315391764044762, "rewards/margins": 0.02081756666302681, "rewards/rejected": 0.012336350046098232, "step": 127 }, { "epoch": 0.07, "learning_rate": 5.739910313901345e-08, "logits/chosen": -2.2564637660980225, "logits/rejected": -2.2271780967712402, "logps/chosen": -42.34345245361328, "logps/rejected": -22.26595687866211, "loss": 0.6587, "rewards/accuracies": 1.0, "rewards/chosen": 0.059456635266542435, "rewards/margins": 0.07014350593090057, "rewards/rejected": -0.010686874389648438, "step": 128 }, { "epoch": 0.07, "learning_rate": 5.7847533632286997e-08, "logits/chosen": -2.1771492958068848, "logits/rejected": -2.177029609680176, "logps/chosen": -11.893133163452148, "logps/rejected": -8.992386817932129, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.043456077575683594, "rewards/margins": 0.03532571718096733, "rewards/rejected": 0.008130359463393688, "step": 129 }, { "epoch": 0.07, "learning_rate": 5.8295964125560534e-08, "logits/chosen": -2.222480058670044, "logits/rejected": -2.372687816619873, "logps/chosen": -11.703049659729004, "logps/rejected": -14.85295581817627, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.019735241308808327, "rewards/margins": -0.0007623676210641861, "rewards/rejected": 0.020497608929872513, "step": 130 }, { "epoch": 0.07, "learning_rate": 5.874439461883408e-08, "logits/chosen": -2.0909852981567383, "logits/rejected": -2.0932981967926025, "logps/chosen": -18.831466674804688, "logps/rejected": -10.813238143920898, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.04414978250861168, "rewards/margins": 0.00855398178100586, "rewards/rejected": 0.03559580072760582, "step": 131 }, { "epoch": 0.07, "learning_rate": 5.919282511210762e-08, "logits/chosen": -2.1733639240264893, "logits/rejected": -2.1704630851745605, "logps/chosen": -21.049030303955078, "logps/rejected": -8.652996063232422, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.03307361528277397, "rewards/margins": 0.0012563690543174744, "rewards/rejected": 0.0318172462284565, "step": 132 }, { "epoch": 0.07, "learning_rate": 5.964125560538115e-08, "logits/chosen": -2.01060152053833, "logits/rejected": -2.011457681655884, "logps/chosen": -9.648884773254395, "logps/rejected": -9.36882209777832, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.036797523498535156, "rewards/margins": 0.019031714648008347, "rewards/rejected": 0.01776580885052681, "step": 133 }, { "epoch": 0.07, "learning_rate": 6.00896860986547e-08, "logits/chosen": -2.150963306427002, "logits/rejected": -2.1618306636810303, "logps/chosen": -12.842151641845703, "logps/rejected": -15.747689247131348, "loss": 0.7033, "rewards/accuracies": 0.0, "rewards/chosen": 0.00969076156616211, "rewards/margins": -0.020176315680146217, "rewards/rejected": 0.029867077246308327, "step": 134 }, { "epoch": 0.07, "learning_rate": 6.053811659192824e-08, "logits/chosen": -2.150934934616089, "logits/rejected": -2.137765645980835, "logps/chosen": -27.22845458984375, "logps/rejected": -6.653388977050781, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.06056804582476616, "rewards/margins": 0.03313488885760307, "rewards/rejected": 0.027433156967163086, "step": 135 }, { "epoch": 0.07, "learning_rate": 6.098654708520179e-08, "logits/chosen": -2.203087568283081, "logits/rejected": -2.1973745822906494, "logps/chosen": -19.172046661376953, "logps/rejected": -9.989663124084473, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.021472549065947533, "rewards/margins": 0.004174517467617989, "rewards/rejected": 0.017298031598329544, "step": 136 }, { "epoch": 0.07, "learning_rate": 6.143497757847533e-08, "logits/chosen": -2.030684232711792, "logits/rejected": -2.0328307151794434, "logps/chosen": -8.996236801147461, "logps/rejected": -7.539095878601074, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.032943155616521835, "rewards/margins": 0.003959991037845612, "rewards/rejected": 0.028983164578676224, "step": 137 }, { "epoch": 0.07, "learning_rate": 6.188340807174888e-08, "logits/chosen": -2.205660820007324, "logits/rejected": -2.337773561477661, "logps/chosen": -8.751143455505371, "logps/rejected": -8.355916023254395, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.021953297778964043, "rewards/margins": -0.010481452569365501, "rewards/rejected": 0.032434750348329544, "step": 138 }, { "epoch": 0.07, "learning_rate": 6.233183856502242e-08, "logits/chosen": -2.0280184745788574, "logits/rejected": -2.2849411964416504, "logps/chosen": -9.521728515625, "logps/rejected": -9.248092651367188, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.027526570484042168, "rewards/margins": -0.00782785378396511, "rewards/rejected": 0.03535442426800728, "step": 139 }, { "epoch": 0.08, "learning_rate": 6.278026905829596e-08, "logits/chosen": -2.141062021255493, "logits/rejected": -2.3140478134155273, "logps/chosen": -11.599485397338867, "logps/rejected": -11.585007667541504, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.04645271226763725, "rewards/margins": 0.01529378816485405, "rewards/rejected": 0.031158924102783203, "step": 140 }, { "epoch": 0.08, "learning_rate": 6.322869955156951e-08, "logits/chosen": -2.0849201679229736, "logits/rejected": -2.0949087142944336, "logps/chosen": -15.385628700256348, "logps/rejected": -9.008027076721191, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.04863996431231499, "rewards/margins": 0.03045968897640705, "rewards/rejected": 0.018180275335907936, "step": 141 }, { "epoch": 0.08, "learning_rate": 6.367713004484304e-08, "logits/chosen": -2.168081045150757, "logits/rejected": -2.168997049331665, "logps/chosen": -13.575304985046387, "logps/rejected": -8.40803337097168, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.07370863109827042, "rewards/margins": 0.02294921875, "rewards/rejected": 0.050759412348270416, "step": 142 }, { "epoch": 0.08, "learning_rate": 6.412556053811658e-08, "logits/chosen": -1.9823572635650635, "logits/rejected": -1.9902950525283813, "logps/chosen": -10.533583641052246, "logps/rejected": -8.484397888183594, "loss": 0.6725, "rewards/accuracies": 1.0, "rewards/chosen": 0.06901540607213974, "rewards/margins": 0.04163140803575516, "rewards/rejected": 0.027383996173739433, "step": 143 }, { "epoch": 0.08, "learning_rate": 6.457399103139013e-08, "logits/chosen": -2.1877636909484863, "logits/rejected": -2.288701295852661, "logps/chosen": -10.797584533691406, "logps/rejected": -9.468311309814453, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.035866737365722656, "rewards/margins": 0.007540702819824219, "rewards/rejected": 0.028326034545898438, "step": 144 }, { "epoch": 0.08, "learning_rate": 6.502242152466367e-08, "logits/chosen": -2.2022764682769775, "logits/rejected": -2.1997759342193604, "logps/chosen": -11.286023139953613, "logps/rejected": -8.382928848266602, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.02986316755414009, "rewards/margins": 0.0001493450254201889, "rewards/rejected": 0.029713822528719902, "step": 145 }, { "epoch": 0.08, "learning_rate": 6.547085201793722e-08, "logits/chosen": -2.250258684158325, "logits/rejected": -2.376148223876953, "logps/chosen": -10.523771286010742, "logps/rejected": -10.454777717590332, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.0446682944893837, "rewards/margins": 0.011810492724180222, "rewards/rejected": 0.032857801765203476, "step": 146 }, { "epoch": 0.08, "learning_rate": 6.591928251121076e-08, "logits/chosen": -2.0775792598724365, "logits/rejected": -2.329127073287964, "logps/chosen": -10.328490257263184, "logps/rejected": -10.934940338134766, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.04047899320721626, "rewards/margins": 0.0005448348820209503, "rewards/rejected": 0.03993415832519531, "step": 147 }, { "epoch": 0.08, "learning_rate": 6.63677130044843e-08, "logits/chosen": -2.106168508529663, "logits/rejected": -2.364114761352539, "logps/chosen": -10.163809776306152, "logps/rejected": -10.487626075744629, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.05947694927453995, "rewards/margins": 0.012815859168767929, "rewards/rejected": 0.04666109010577202, "step": 148 }, { "epoch": 0.08, "learning_rate": 6.681614349775785e-08, "logits/chosen": -2.1529171466827393, "logits/rejected": -2.1595458984375, "logps/chosen": -11.721691131591797, "logps/rejected": -8.452691078186035, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 0.05664539337158203, "rewards/margins": 0.034816645085811615, "rewards/rejected": 0.021828746423125267, "step": 149 }, { "epoch": 0.08, "learning_rate": 6.72645739910314e-08, "logits/chosen": -2.2259862422943115, "logits/rejected": -2.229917526245117, "logps/chosen": -8.751144409179688, "logps/rejected": -7.933202266693115, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.04215555265545845, "rewards/margins": 0.013655615970492363, "rewards/rejected": 0.028499936684966087, "step": 150 }, { "epoch": 0.08, "learning_rate": 6.771300448430492e-08, "logits/chosen": -2.2487564086914062, "logits/rejected": -2.2480525970458984, "logps/chosen": -15.655251502990723, "logps/rejected": -8.889596939086914, "loss": 0.6755, "rewards/accuracies": 1.0, "rewards/chosen": 0.07068166881799698, "rewards/margins": 0.03562498092651367, "rewards/rejected": 0.03505668789148331, "step": 151 }, { "epoch": 0.08, "learning_rate": 6.816143497757847e-08, "logits/chosen": -2.156982660293579, "logits/rejected": -2.154498815536499, "logps/chosen": -19.48957633972168, "logps/rejected": -9.448293685913086, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.0635438933968544, "rewards/margins": 0.0166168212890625, "rewards/rejected": 0.0469270721077919, "step": 152 }, { "epoch": 0.08, "learning_rate": 6.860986547085201e-08, "logits/chosen": -2.1042230129241943, "logits/rejected": -2.3631629943847656, "logps/chosen": -18.502944946289062, "logps/rejected": -17.510501861572266, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.05424213409423828, "rewards/margins": 0.008861351758241653, "rewards/rejected": 0.04538078233599663, "step": 153 }, { "epoch": 0.08, "learning_rate": 6.905829596412556e-08, "logits/chosen": -2.15848708152771, "logits/rejected": -2.301358938217163, "logps/chosen": -9.790565490722656, "logps/rejected": -9.608199119567871, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.0463104248046875, "rewards/margins": -0.00406656414270401, "rewards/rejected": 0.05037698894739151, "step": 154 }, { "epoch": 0.08, "learning_rate": 6.95067264573991e-08, "logits/chosen": -2.108376979827881, "logits/rejected": -2.319633960723877, "logps/chosen": -7.985196113586426, "logps/rejected": -8.141115188598633, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.03291158750653267, "rewards/margins": 0.006400680169463158, "rewards/rejected": 0.02651090733706951, "step": 155 }, { "epoch": 0.08, "learning_rate": 6.995515695067265e-08, "logits/chosen": -2.129973888397217, "logits/rejected": -2.1300711631774902, "logps/chosen": -16.832103729248047, "logps/rejected": -9.577864646911621, "loss": 0.6624, "rewards/accuracies": 1.0, "rewards/chosen": 0.0929718017578125, "rewards/margins": 0.06252937018871307, "rewards/rejected": 0.030442429706454277, "step": 156 }, { "epoch": 0.08, "learning_rate": 7.040358744394619e-08, "logits/chosen": -2.1312131881713867, "logits/rejected": -2.2647225856781006, "logps/chosen": -12.799233436584473, "logps/rejected": -12.851602554321289, "loss": 0.7025, "rewards/accuracies": 0.0, "rewards/chosen": 0.05141744762659073, "rewards/margins": -0.018636129796504974, "rewards/rejected": 0.0700535774230957, "step": 157 }, { "epoch": 0.09, "learning_rate": 7.085201793721973e-08, "logits/chosen": -2.1365015506744385, "logits/rejected": -2.1432697772979736, "logps/chosen": -14.675183296203613, "logps/rejected": -22.069135665893555, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.07284622639417648, "rewards/margins": 0.03467312082648277, "rewards/rejected": 0.03817310556769371, "step": 158 }, { "epoch": 0.09, "learning_rate": 7.130044843049327e-08, "logits/chosen": -2.099518060684204, "logits/rejected": -2.1072678565979004, "logps/chosen": -15.344573974609375, "logps/rejected": -8.361824035644531, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.07089614868164062, "rewards/margins": 0.02613086625933647, "rewards/rejected": 0.04476528242230415, "step": 159 }, { "epoch": 0.09, "learning_rate": 7.174887892376681e-08, "logits/chosen": -2.095369577407837, "logits/rejected": -2.301807403564453, "logps/chosen": -11.176008224487305, "logps/rejected": -10.999109268188477, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.07377796620130539, "rewards/margins": 0.014893438667058945, "rewards/rejected": 0.058884527534246445, "step": 160 }, { "epoch": 0.09, "learning_rate": 7.219730941704035e-08, "logits/chosen": -2.058295488357544, "logits/rejected": -2.092419147491455, "logps/chosen": -15.98727035522461, "logps/rejected": -14.383010864257812, "loss": 0.6662, "rewards/accuracies": 1.0, "rewards/chosen": 0.07404708862304688, "rewards/margins": 0.05469474941492081, "rewards/rejected": 0.019352341070771217, "step": 161 }, { "epoch": 0.09, "learning_rate": 7.26457399103139e-08, "logits/chosen": -2.0980911254882812, "logits/rejected": -2.2929000854492188, "logps/chosen": -9.286234855651855, "logps/rejected": -9.24964714050293, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.057267189025878906, "rewards/margins": 0.009514424949884415, "rewards/rejected": 0.04775276407599449, "step": 162 }, { "epoch": 0.09, "learning_rate": 7.309417040358744e-08, "logits/chosen": -2.0591628551483154, "logits/rejected": -2.065683126449585, "logps/chosen": -9.759934425354004, "logps/rejected": -8.644917488098145, "loss": 0.6648, "rewards/accuracies": 1.0, "rewards/chosen": 0.080152228474617, "rewards/margins": 0.05755644291639328, "rewards/rejected": 0.022595787420868874, "step": 163 }, { "epoch": 0.09, "learning_rate": 7.354260089686099e-08, "logits/chosen": -2.0958950519561768, "logits/rejected": -2.094785451889038, "logps/chosen": -11.027265548706055, "logps/rejected": -9.304574966430664, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.06663360446691513, "rewards/margins": 0.017730042338371277, "rewards/rejected": 0.048903562128543854, "step": 164 }, { "epoch": 0.09, "learning_rate": 7.399103139013453e-08, "logits/chosen": -2.1579623222351074, "logits/rejected": -2.308048963546753, "logps/chosen": -9.929116249084473, "logps/rejected": -9.554590225219727, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.06245260313153267, "rewards/margins": -0.0044286735355854034, "rewards/rejected": 0.06688127666711807, "step": 165 }, { "epoch": 0.09, "learning_rate": 7.443946188340808e-08, "logits/chosen": -2.1558175086975098, "logits/rejected": -2.1637516021728516, "logps/chosen": -25.488584518432617, "logps/rejected": -8.185203552246094, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 0.08897800743579865, "rewards/margins": 0.046781446784734726, "rewards/rejected": 0.04219656065106392, "step": 166 }, { "epoch": 0.09, "learning_rate": 7.488789237668162e-08, "logits/chosen": -2.084242582321167, "logits/rejected": -2.265113592147827, "logps/chosen": -8.344563484191895, "logps/rejected": -8.12463092803955, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.0444093719124794, "rewards/margins": 0.005784034729003906, "rewards/rejected": 0.038625337183475494, "step": 167 }, { "epoch": 0.09, "learning_rate": 7.533632286995515e-08, "logits/chosen": -2.2380917072296143, "logits/rejected": -2.246823310852051, "logps/chosen": -15.23573112487793, "logps/rejected": -8.572559356689453, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.08827152103185654, "rewards/margins": 0.018443197011947632, "rewards/rejected": 0.0698283240199089, "step": 168 }, { "epoch": 0.09, "learning_rate": 7.57847533632287e-08, "logits/chosen": -2.052189350128174, "logits/rejected": -2.052978277206421, "logps/chosen": -17.721710205078125, "logps/rejected": -8.754632949829102, "loss": 0.6703, "rewards/accuracies": 1.0, "rewards/chosen": 0.0850820541381836, "rewards/margins": 0.046297837048769, "rewards/rejected": 0.0387842170894146, "step": 169 }, { "epoch": 0.09, "learning_rate": 7.623318385650224e-08, "logits/chosen": -2.1357312202453613, "logits/rejected": -2.1386735439300537, "logps/chosen": -15.982126235961914, "logps/rejected": -12.424668312072754, "loss": 0.6675, "rewards/accuracies": 1.0, "rewards/chosen": 0.10352382808923721, "rewards/margins": 0.05206108093261719, "rewards/rejected": 0.051462747156620026, "step": 170 }, { "epoch": 0.09, "learning_rate": 7.668161434977578e-08, "logits/chosen": -2.0939395427703857, "logits/rejected": -2.397951364517212, "logps/chosen": -8.76603889465332, "logps/rejected": -8.922887802124023, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.06872053444385529, "rewards/margins": -0.01051950454711914, "rewards/rejected": 0.07924003899097443, "step": 171 }, { "epoch": 0.09, "learning_rate": 7.713004484304933e-08, "logits/chosen": -2.106034517288208, "logits/rejected": -2.115678071975708, "logps/chosen": -18.867578506469727, "logps/rejected": -14.149084091186523, "loss": 0.6715, "rewards/accuracies": 1.0, "rewards/chosen": 0.10873718559741974, "rewards/margins": 0.04377460479736328, "rewards/rejected": 0.06496258080005646, "step": 172 }, { "epoch": 0.09, "learning_rate": 7.757847533632287e-08, "logits/chosen": -2.240757703781128, "logits/rejected": -2.2403478622436523, "logps/chosen": -17.686065673828125, "logps/rejected": -13.631692886352539, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.09600906819105148, "rewards/margins": 0.027386479079723358, "rewards/rejected": 0.06862258911132812, "step": 173 }, { "epoch": 0.09, "learning_rate": 7.802690582959642e-08, "logits/chosen": -2.0959324836730957, "logits/rejected": -2.3467681407928467, "logps/chosen": -8.98226261138916, "logps/rejected": -9.004222869873047, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.0933959037065506, "rewards/margins": 0.017311766743659973, "rewards/rejected": 0.07608413696289062, "step": 174 }, { "epoch": 0.09, "learning_rate": 7.847533632286996e-08, "logits/chosen": -2.1943130493164062, "logits/rejected": -2.334688186645508, "logps/chosen": -12.489972114562988, "logps/rejected": -12.135552406311035, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.07454891502857208, "rewards/margins": 0.0007824897766113281, "rewards/rejected": 0.07376642525196075, "step": 175 }, { "epoch": 0.09, "learning_rate": 7.892376681614349e-08, "logits/chosen": -2.1276724338531494, "logits/rejected": -2.302755355834961, "logps/chosen": -14.2986421585083, "logps/rejected": -13.783717155456543, "loss": 0.7078, "rewards/accuracies": 0.0, "rewards/chosen": 0.06320858001708984, "rewards/margins": -0.029034040868282318, "rewards/rejected": 0.09224262088537216, "step": 176 }, { "epoch": 0.1, "learning_rate": 7.937219730941704e-08, "logits/chosen": -2.182730197906494, "logits/rejected": -2.226109743118286, "logps/chosen": -17.741308212280273, "logps/rejected": -29.13077163696289, "loss": 0.6555, "rewards/accuracies": 1.0, "rewards/chosen": 0.1068599745631218, "rewards/margins": 0.07686920464038849, "rewards/rejected": 0.029990768060088158, "step": 177 }, { "epoch": 0.1, "learning_rate": 7.982062780269058e-08, "logits/chosen": -2.219367742538452, "logits/rejected": -2.3640599250793457, "logps/chosen": -16.72336196899414, "logps/rejected": -10.845973014831543, "loss": 0.7111, "rewards/accuracies": 0.0, "rewards/chosen": 0.0537836067378521, "rewards/margins": -0.03552732989192009, "rewards/rejected": 0.08931093662977219, "step": 178 }, { "epoch": 0.1, "learning_rate": 8.026905829596413e-08, "logits/chosen": -2.204779624938965, "logits/rejected": -2.2097256183624268, "logps/chosen": -10.334275245666504, "logps/rejected": -8.014403343200684, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.10271387547254562, "rewards/margins": 0.03123503178358078, "rewards/rejected": 0.07147884368896484, "step": 179 }, { "epoch": 0.1, "learning_rate": 8.071748878923767e-08, "logits/chosen": -2.238377809524536, "logits/rejected": -2.2376296520233154, "logps/chosen": -12.337857246398926, "logps/rejected": -10.025948524475098, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.09621258080005646, "rewards/margins": 0.02966461330652237, "rewards/rejected": 0.06654796749353409, "step": 180 }, { "epoch": 0.1, "learning_rate": 8.116591928251121e-08, "logits/chosen": -2.1805810928344727, "logits/rejected": -2.1736044883728027, "logps/chosen": -15.250764846801758, "logps/rejected": -12.970149993896484, "loss": 0.6597, "rewards/accuracies": 1.0, "rewards/chosen": 0.121923066675663, "rewards/margins": 0.0680398941040039, "rewards/rejected": 0.05388317257165909, "step": 181 }, { "epoch": 0.1, "learning_rate": 8.161434977578476e-08, "logits/chosen": -2.0260283946990967, "logits/rejected": -2.0080153942108154, "logps/chosen": -24.82144546508789, "logps/rejected": -12.90349006652832, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.07483138889074326, "rewards/margins": -0.012664034962654114, "rewards/rejected": 0.08749542385339737, "step": 182 }, { "epoch": 0.1, "learning_rate": 8.206278026905829e-08, "logits/chosen": -2.045006036758423, "logits/rejected": -2.293363571166992, "logps/chosen": -7.759705066680908, "logps/rejected": -7.686592102050781, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.0799499973654747, "rewards/margins": -0.0047380924224853516, "rewards/rejected": 0.08468808978796005, "step": 183 }, { "epoch": 0.1, "learning_rate": 8.251121076233183e-08, "logits/chosen": -2.0007314682006836, "logits/rejected": -2.001803398132324, "logps/chosen": -9.32823657989502, "logps/rejected": -7.478575706481934, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.11934423446655273, "rewards/margins": 0.008007906377315521, "rewards/rejected": 0.11133632808923721, "step": 184 }, { "epoch": 0.1, "learning_rate": 8.295964125560538e-08, "logits/chosen": -2.0931529998779297, "logits/rejected": -2.2708652019500732, "logps/chosen": -8.95576286315918, "logps/rejected": -19.386049270629883, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.09075450897216797, "rewards/margins": 0.03824787214398384, "rewards/rejected": 0.05250663682818413, "step": 185 }, { "epoch": 0.1, "learning_rate": 8.340807174887892e-08, "logits/chosen": -2.112194299697876, "logits/rejected": -2.312469005584717, "logps/chosen": -11.390575408935547, "logps/rejected": -11.440206527709961, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.08471622318029404, "rewards/margins": 0.005144208669662476, "rewards/rejected": 0.07957201451063156, "step": 186 }, { "epoch": 0.1, "learning_rate": 8.385650224215247e-08, "logits/chosen": -2.062453508377075, "logits/rejected": -2.2900960445404053, "logps/chosen": -9.363770484924316, "logps/rejected": -9.502544403076172, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.13506117463111877, "rewards/margins": 0.025108054280281067, "rewards/rejected": 0.10995312035083771, "step": 187 }, { "epoch": 0.1, "learning_rate": 8.4304932735426e-08, "logits/chosen": -2.2965354919433594, "logits/rejected": -2.3692684173583984, "logps/chosen": -9.42007827758789, "logps/rejected": -9.53997802734375, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.0570768378674984, "rewards/margins": 0.019116021692752838, "rewards/rejected": 0.03796081617474556, "step": 188 }, { "epoch": 0.1, "learning_rate": 8.475336322869954e-08, "logits/chosen": -2.2516720294952393, "logits/rejected": -2.251451253890991, "logps/chosen": -8.240342140197754, "logps/rejected": -11.273981094360352, "loss": 0.6643, "rewards/accuracies": 1.0, "rewards/chosen": 0.13470296561717987, "rewards/margins": 0.058566950261592865, "rewards/rejected": 0.076136015355587, "step": 189 }, { "epoch": 0.1, "learning_rate": 8.520179372197309e-08, "logits/chosen": -2.1496667861938477, "logits/rejected": -2.2132110595703125, "logps/chosen": -8.318879127502441, "logps/rejected": -25.141511917114258, "loss": 0.6155, "rewards/accuracies": 1.0, "rewards/chosen": 0.16830264031887054, "rewards/margins": 0.16181498765945435, "rewards/rejected": 0.00648765591904521, "step": 190 }, { "epoch": 0.1, "learning_rate": 8.565022421524663e-08, "logits/chosen": -2.075225353240967, "logits/rejected": -2.268364429473877, "logps/chosen": -8.823890686035156, "logps/rejected": -8.46231460571289, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.10824737697839737, "rewards/margins": -0.001315973699092865, "rewards/rejected": 0.10956335067749023, "step": 191 }, { "epoch": 0.1, "learning_rate": 8.609865470852018e-08, "logits/chosen": -2.1329903602600098, "logits/rejected": -2.305708646774292, "logps/chosen": -7.868807315826416, "logps/rejected": -7.682371139526367, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.13749785721302032, "rewards/margins": 0.03256268799304962, "rewards/rejected": 0.1049351692199707, "step": 192 }, { "epoch": 0.1, "learning_rate": 8.65470852017937e-08, "logits/chosen": -2.2399702072143555, "logits/rejected": -2.2428019046783447, "logps/chosen": -8.448199272155762, "logps/rejected": -7.4341349601745605, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.11222954094409943, "rewards/margins": 0.02268987149000168, "rewards/rejected": 0.08953966945409775, "step": 193 }, { "epoch": 0.1, "learning_rate": 8.699551569506725e-08, "logits/chosen": -2.221921682357788, "logits/rejected": -2.224247932434082, "logps/chosen": -8.1591215133667, "logps/rejected": -7.578027248382568, "loss": 0.6383, "rewards/accuracies": 1.0, "rewards/chosen": 0.1545828878879547, "rewards/margins": 0.11280208081007004, "rewards/rejected": 0.041780807077884674, "step": 194 }, { "epoch": 0.11, "learning_rate": 8.74439461883408e-08, "logits/chosen": -2.2043297290802, "logits/rejected": -2.208400011062622, "logps/chosen": -9.28261661529541, "logps/rejected": -10.672384262084961, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.10854349285364151, "rewards/margins": 0.02508249133825302, "rewards/rejected": 0.08346100151538849, "step": 195 }, { "epoch": 0.11, "learning_rate": 8.789237668161434e-08, "logits/chosen": -2.0927443504333496, "logits/rejected": -2.130547285079956, "logps/chosen": -19.540691375732422, "logps/rejected": -13.452997207641602, "loss": 0.6668, "rewards/accuracies": 1.0, "rewards/chosen": 0.13068810105323792, "rewards/margins": 0.05333023518323898, "rewards/rejected": 0.07735786586999893, "step": 196 }, { "epoch": 0.11, "learning_rate": 8.834080717488788e-08, "logits/chosen": -2.0194625854492188, "logits/rejected": -2.021169900894165, "logps/chosen": -16.356491088867188, "logps/rejected": -8.420969009399414, "loss": 0.7021, "rewards/accuracies": 0.0, "rewards/chosen": 0.11533355712890625, "rewards/margins": -0.01789073646068573, "rewards/rejected": 0.13322429358959198, "step": 197 }, { "epoch": 0.11, "learning_rate": 8.878923766816143e-08, "logits/chosen": -2.1905972957611084, "logits/rejected": -2.30155348777771, "logps/chosen": -8.529006004333496, "logps/rejected": -13.14638614654541, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.12336378544569016, "rewards/margins": 0.019808389246463776, "rewards/rejected": 0.10355539619922638, "step": 198 }, { "epoch": 0.11, "learning_rate": 8.923766816143497e-08, "logits/chosen": -2.0473885536193848, "logits/rejected": -2.3139517307281494, "logps/chosen": -8.350235939025879, "logps/rejected": -8.273663520812988, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.1587628424167633, "rewards/margins": 0.014331623911857605, "rewards/rejected": 0.1444312185049057, "step": 199 }, { "epoch": 0.11, "learning_rate": 8.968609865470852e-08, "logits/chosen": -2.193572759628296, "logits/rejected": -2.176440715789795, "logps/chosen": -24.67461395263672, "logps/rejected": -8.571072578430176, "loss": 0.6122, "rewards/accuracies": 1.0, "rewards/chosen": 0.2385379821062088, "rewards/margins": 0.16902047395706177, "rewards/rejected": 0.06951751559972763, "step": 200 }, { "epoch": 0.11, "learning_rate": 9.013452914798206e-08, "logits/chosen": -2.079859495162964, "logits/rejected": -2.0645570755004883, "logps/chosen": -18.583797454833984, "logps/rejected": -8.554998397827148, "loss": 0.6698, "rewards/accuracies": 1.0, "rewards/chosen": 0.13056297600269318, "rewards/margins": 0.0472416877746582, "rewards/rejected": 0.08332128822803497, "step": 201 }, { "epoch": 0.11, "learning_rate": 9.058295964125559e-08, "logits/chosen": -2.089779853820801, "logits/rejected": -2.1030025482177734, "logps/chosen": -13.038980484008789, "logps/rejected": -12.696317672729492, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.11188793182373047, "rewards/margins": 0.0043122246861457825, "rewards/rejected": 0.10757570713758469, "step": 202 }, { "epoch": 0.11, "learning_rate": 9.103139013452914e-08, "logits/chosen": -2.0017905235290527, "logits/rejected": -2.296097993850708, "logps/chosen": -7.963188171386719, "logps/rejected": -7.729087829589844, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.13587455451488495, "rewards/margins": 0.017063520848751068, "rewards/rejected": 0.11881103366613388, "step": 203 }, { "epoch": 0.11, "learning_rate": 9.147982062780268e-08, "logits/chosen": -2.0710060596466064, "logits/rejected": -2.2767767906188965, "logps/chosen": -8.910884857177734, "logps/rejected": -9.08833122253418, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.17084971070289612, "rewards/margins": 0.009906873106956482, "rewards/rejected": 0.16094283759593964, "step": 204 }, { "epoch": 0.11, "learning_rate": 9.192825112107622e-08, "logits/chosen": -2.0402278900146484, "logits/rejected": -2.0402493476867676, "logps/chosen": -11.038456916809082, "logps/rejected": -9.81595230102539, "loss": 0.7221, "rewards/accuracies": 0.0, "rewards/chosen": 0.10874862968921661, "rewards/margins": -0.05717630684375763, "rewards/rejected": 0.16592493653297424, "step": 205 }, { "epoch": 0.11, "learning_rate": 9.237668161434977e-08, "logits/chosen": -2.237215518951416, "logits/rejected": -2.2646374702453613, "logps/chosen": -34.94947814941406, "logps/rejected": -26.042064666748047, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.10784988850355148, "rewards/margins": 0.023770146071910858, "rewards/rejected": 0.08407974243164062, "step": 206 }, { "epoch": 0.11, "learning_rate": 9.282511210762331e-08, "logits/chosen": -2.1324493885040283, "logits/rejected": -2.3191416263580322, "logps/chosen": -8.769583702087402, "logps/rejected": -8.45001220703125, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.1174384132027626, "rewards/margins": -0.0025734901428222656, "rewards/rejected": 0.12001190334558487, "step": 207 }, { "epoch": 0.11, "learning_rate": 9.327354260089686e-08, "logits/chosen": -2.110041856765747, "logits/rejected": -2.1037211418151855, "logps/chosen": -11.806970596313477, "logps/rejected": -8.80484390258789, "loss": 0.709, "rewards/accuracies": 0.0, "rewards/chosen": 0.11496849358081818, "rewards/margins": -0.031439393758773804, "rewards/rejected": 0.14640788733959198, "step": 208 }, { "epoch": 0.11, "learning_rate": 9.37219730941704e-08, "logits/chosen": -2.1176414489746094, "logits/rejected": -2.282075881958008, "logps/chosen": -13.59089183807373, "logps/rejected": -10.450516700744629, "loss": 0.7069, "rewards/accuracies": 0.0, "rewards/chosen": 0.13928423821926117, "rewards/margins": -0.02738790214061737, "rewards/rejected": 0.16667214035987854, "step": 209 }, { "epoch": 0.11, "learning_rate": 9.417040358744395e-08, "logits/chosen": -2.0132758617401123, "logits/rejected": -2.02662992477417, "logps/chosen": -14.227462768554688, "logps/rejected": -13.853815078735352, "loss": 0.6591, "rewards/accuracies": 1.0, "rewards/chosen": 0.13005809485912323, "rewards/margins": 0.06925392150878906, "rewards/rejected": 0.060804177075624466, "step": 210 }, { "epoch": 0.11, "learning_rate": 9.461883408071748e-08, "logits/chosen": -2.1436941623687744, "logits/rejected": -2.159727096557617, "logps/chosen": -20.3396053314209, "logps/rejected": -10.538317680358887, "loss": 0.6568, "rewards/accuracies": 1.0, "rewards/chosen": 0.19419975578784943, "rewards/margins": 0.07412862777709961, "rewards/rejected": 0.12007112801074982, "step": 211 }, { "epoch": 0.11, "learning_rate": 9.506726457399102e-08, "logits/chosen": -2.1240808963775635, "logits/rejected": -2.114011526107788, "logps/chosen": -10.133874893188477, "logps/rejected": -8.915216445922852, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.14427052438259125, "rewards/margins": 0.014631658792495728, "rewards/rejected": 0.12963886559009552, "step": 212 }, { "epoch": 0.11, "learning_rate": 9.551569506726457e-08, "logits/chosen": -2.142681360244751, "logits/rejected": -2.1122539043426514, "logps/chosen": -30.398822784423828, "logps/rejected": -8.737654685974121, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": 0.10628338158130646, "rewards/margins": -0.014767266809940338, "rewards/rejected": 0.1210506483912468, "step": 213 }, { "epoch": 0.12, "learning_rate": 9.596412556053811e-08, "logits/chosen": -2.1852049827575684, "logits/rejected": -2.1876866817474365, "logps/chosen": -9.9547119140625, "logps/rejected": -8.377635955810547, "loss": 0.6688, "rewards/accuracies": 1.0, "rewards/chosen": 0.1258411407470703, "rewards/margins": 0.04922495037317276, "rewards/rejected": 0.07661619037389755, "step": 214 }, { "epoch": 0.12, "learning_rate": 9.641255605381165e-08, "logits/chosen": -2.129840612411499, "logits/rejected": -2.137751579284668, "logps/chosen": -12.272473335266113, "logps/rejected": -8.409876823425293, "loss": 0.6509, "rewards/accuracies": 1.0, "rewards/chosen": 0.20015068352222443, "rewards/margins": 0.08625993877649307, "rewards/rejected": 0.11389074474573135, "step": 215 }, { "epoch": 0.12, "learning_rate": 9.68609865470852e-08, "logits/chosen": -2.0912771224975586, "logits/rejected": -2.287191390991211, "logps/chosen": -11.70600700378418, "logps/rejected": -9.075569152832031, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.1347360610961914, "rewards/margins": 0.011599823832511902, "rewards/rejected": 0.1231362372636795, "step": 216 }, { "epoch": 0.12, "learning_rate": 9.730941704035874e-08, "logits/chosen": -2.1050357818603516, "logits/rejected": -2.106595754623413, "logps/chosen": -12.941492080688477, "logps/rejected": -9.064411163330078, "loss": 0.665, "rewards/accuracies": 1.0, "rewards/chosen": 0.18914900720119476, "rewards/margins": 0.057101160287857056, "rewards/rejected": 0.1320478469133377, "step": 217 }, { "epoch": 0.12, "learning_rate": 9.775784753363229e-08, "logits/chosen": -2.207892894744873, "logits/rejected": -2.1921050548553467, "logps/chosen": -27.030921936035156, "logps/rejected": -8.863809585571289, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.1937919706106186, "rewards/margins": -0.01382569968700409, "rewards/rejected": 0.20761767029762268, "step": 218 }, { "epoch": 0.12, "learning_rate": 9.820627802690582e-08, "logits/chosen": -2.1833336353302, "logits/rejected": -2.302905797958374, "logps/chosen": -11.079547882080078, "logps/rejected": -14.80655288696289, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.16338662803173065, "rewards/margins": 0.009508028626441956, "rewards/rejected": 0.1538785994052887, "step": 219 }, { "epoch": 0.12, "learning_rate": 9.865470852017936e-08, "logits/chosen": -2.1768829822540283, "logits/rejected": -2.226980209350586, "logps/chosen": -26.996854782104492, "logps/rejected": -27.486835479736328, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.13653507828712463, "rewards/margins": -0.007464021444320679, "rewards/rejected": 0.1439990997314453, "step": 220 }, { "epoch": 0.12, "learning_rate": 9.910313901345291e-08, "logits/chosen": -2.0876660346984863, "logits/rejected": -2.0956616401672363, "logps/chosen": -9.590291976928711, "logps/rejected": -7.434976100921631, "loss": 0.6488, "rewards/accuracies": 1.0, "rewards/chosen": 0.2264888733625412, "rewards/margins": 0.09072260558605194, "rewards/rejected": 0.13576626777648926, "step": 221 }, { "epoch": 0.12, "learning_rate": 9.955156950672645e-08, "logits/chosen": -2.149467706680298, "logits/rejected": -2.309718608856201, "logps/chosen": -16.463584899902344, "logps/rejected": -8.653313636779785, "loss": 0.7098, "rewards/accuracies": 0.0, "rewards/chosen": 0.127177432179451, "rewards/margins": -0.03304585814476013, "rewards/rejected": 0.16022329032421112, "step": 222 }, { "epoch": 0.12, "learning_rate": 1e-07, "logits/chosen": -2.145925760269165, "logits/rejected": -2.3519833087921143, "logps/chosen": -11.560026168823242, "logps/rejected": -11.469268798828125, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.1666887253522873, "rewards/margins": -0.012008190155029297, "rewards/rejected": 0.1786969155073166, "step": 223 }, { "epoch": 0.12, "learning_rate": 9.999999523108451e-08, "logits/chosen": -2.228219985961914, "logits/rejected": -2.2391278743743896, "logps/chosen": -13.891578674316406, "logps/rejected": -7.246333599090576, "loss": 0.6548, "rewards/accuracies": 1.0, "rewards/chosen": 0.22645436227321625, "rewards/margins": 0.07826915383338928, "rewards/rejected": 0.14818520843982697, "step": 224 }, { "epoch": 0.12, "learning_rate": 9.999998092433895e-08, "logits/chosen": -2.0949177742004395, "logits/rejected": -2.299293279647827, "logps/chosen": -8.629185676574707, "logps/rejected": -8.139324188232422, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.20178432762622833, "rewards/margins": -0.005161672830581665, "rewards/rejected": 0.20694600045681, "step": 225 }, { "epoch": 0.12, "learning_rate": 9.999995707976603e-08, "logits/chosen": -2.094233989715576, "logits/rejected": -2.0993762016296387, "logps/chosen": -10.977816581726074, "logps/rejected": -7.29702615737915, "loss": 0.6406, "rewards/accuracies": 1.0, "rewards/chosen": 0.2638051211833954, "rewards/margins": 0.10791231691837311, "rewards/rejected": 0.15589280426502228, "step": 226 }, { "epoch": 0.12, "learning_rate": 9.999992369737032e-08, "logits/chosen": -2.0961174964904785, "logits/rejected": -2.264272451400757, "logps/chosen": -10.496095657348633, "logps/rejected": -11.962312698364258, "loss": 0.7015, "rewards/accuracies": 0.0, "rewards/chosen": 0.17039556801319122, "rewards/margins": -0.016631990671157837, "rewards/rejected": 0.18702755868434906, "step": 227 }, { "epoch": 0.12, "learning_rate": 9.999988077715818e-08, "logits/chosen": -2.1676015853881836, "logits/rejected": -2.169790029525757, "logps/chosen": -7.991328239440918, "logps/rejected": -8.326371192932129, "loss": 0.658, "rewards/accuracies": 1.0, "rewards/chosen": 0.15065088868141174, "rewards/margins": 0.07152910530567169, "rewards/rejected": 0.07912178337574005, "step": 228 }, { "epoch": 0.12, "learning_rate": 9.99998283191378e-08, "logits/chosen": -2.227374792098999, "logits/rejected": -2.370579481124878, "logps/chosen": -10.655989646911621, "logps/rejected": -10.612284660339355, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 0.13422928750514984, "rewards/margins": -0.00990857183933258, "rewards/rejected": 0.14413785934448242, "step": 229 }, { "epoch": 0.12, "learning_rate": 9.999976632331919e-08, "logits/chosen": -2.210545778274536, "logits/rejected": -2.2107558250427246, "logps/chosen": -9.408897399902344, "logps/rejected": -9.551576614379883, "loss": 0.6576, "rewards/accuracies": 1.0, "rewards/chosen": 0.19121809303760529, "rewards/margins": 0.07242679595947266, "rewards/rejected": 0.11879129707813263, "step": 230 }, { "epoch": 0.12, "learning_rate": 9.999969478971416e-08, "logits/chosen": -2.1676905155181885, "logits/rejected": -2.314028739929199, "logps/chosen": -7.99524450302124, "logps/rejected": -7.91736364364624, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.19553914666175842, "rewards/margins": -0.004261866211891174, "rewards/rejected": 0.1998010128736496, "step": 231 }, { "epoch": 0.13, "learning_rate": 9.999961371833637e-08, "logits/chosen": -2.160825252532959, "logits/rejected": -2.162659168243408, "logps/chosen": -19.458114624023438, "logps/rejected": -6.583054542541504, "loss": 0.6199, "rewards/accuracies": 1.0, "rewards/chosen": 0.3167709410190582, "rewards/margins": 0.15234442055225372, "rewards/rejected": 0.1644265204668045, "step": 232 }, { "epoch": 0.13, "learning_rate": 9.999952310920129e-08, "logits/chosen": -2.076399564743042, "logits/rejected": -2.266939878463745, "logps/chosen": -7.237772464752197, "logps/rejected": -7.143137454986572, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.25612932443618774, "rewards/margins": 0.012925058603286743, "rewards/rejected": 0.243204265832901, "step": 233 }, { "epoch": 0.13, "learning_rate": 9.999942296232619e-08, "logits/chosen": -2.238236665725708, "logits/rejected": -2.2450709342956543, "logps/chosen": -9.608499526977539, "logps/rejected": -6.691807746887207, "loss": 0.6273, "rewards/accuracies": 1.0, "rewards/chosen": 0.3181905746459961, "rewards/margins": 0.1363285928964615, "rewards/rejected": 0.1818619817495346, "step": 234 }, { "epoch": 0.13, "learning_rate": 9.99993132777302e-08, "logits/chosen": -2.0109870433807373, "logits/rejected": -2.279961109161377, "logps/chosen": -6.99339485168457, "logps/rejected": -6.7855424880981445, "loss": 0.6982, "rewards/accuracies": 0.0, "rewards/chosen": 0.12493228912353516, "rewards/margins": -0.010034367442131042, "rewards/rejected": 0.1349666565656662, "step": 235 }, { "epoch": 0.13, "learning_rate": 9.999919405543419e-08, "logits/chosen": -2.1590230464935303, "logits/rejected": -2.1356749534606934, "logps/chosen": -31.590980529785156, "logps/rejected": -9.506571769714355, "loss": 0.6299, "rewards/accuracies": 1.0, "rewards/chosen": 0.24283944070339203, "rewards/margins": 0.13080883026123047, "rewards/rejected": 0.11203060299158096, "step": 236 }, { "epoch": 0.13, "learning_rate": 9.999906529546094e-08, "logits/chosen": -2.0742247104644775, "logits/rejected": -2.0805022716522217, "logps/chosen": -19.66790771484375, "logps/rejected": -20.224210739135742, "loss": 0.5967, "rewards/accuracies": 1.0, "rewards/chosen": 0.30900058150291443, "rewards/margins": 0.20321577787399292, "rewards/rejected": 0.10578479617834091, "step": 237 }, { "epoch": 0.13, "learning_rate": 9.999892699783502e-08, "logits/chosen": -1.9997715950012207, "logits/rejected": -2.0025475025177, "logps/chosen": -14.203981399536133, "logps/rejected": -11.42073917388916, "loss": 0.637, "rewards/accuracies": 1.0, "rewards/chosen": 0.2702995240688324, "rewards/margins": 0.11564645171165466, "rewards/rejected": 0.15465307235717773, "step": 238 }, { "epoch": 0.13, "learning_rate": 9.999877916258279e-08, "logits/chosen": -2.1434226036071777, "logits/rejected": -2.1413848400115967, "logps/chosen": -13.27574348449707, "logps/rejected": -9.623025894165039, "loss": 0.6462, "rewards/accuracies": 1.0, "rewards/chosen": 0.23066721856594086, "rewards/margins": 0.09622985124588013, "rewards/rejected": 0.13443736732006073, "step": 239 }, { "epoch": 0.13, "learning_rate": 9.999862178973246e-08, "logits/chosen": -2.087710380554199, "logits/rejected": -2.0870840549468994, "logps/chosen": -14.926244735717773, "logps/rejected": -8.968109130859375, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.22769871354103088, "rewards/margins": 0.03438816964626312, "rewards/rejected": 0.19331054389476776, "step": 240 }, { "epoch": 0.13, "learning_rate": 9.999845487931406e-08, "logits/chosen": -2.0959346294403076, "logits/rejected": -2.0876657962799072, "logps/chosen": -14.669618606567383, "logps/rejected": -8.311982154846191, "loss": 0.6258, "rewards/accuracies": 1.0, "rewards/chosen": 0.2989116609096527, "rewards/margins": 0.13959349691867828, "rewards/rejected": 0.15931816399097443, "step": 241 }, { "epoch": 0.13, "learning_rate": 9.999827843135941e-08, "logits/chosen": -2.136390447616577, "logits/rejected": -2.1370034217834473, "logps/chosen": -8.369855880737305, "logps/rejected": -8.058488845825195, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.26708871126174927, "rewards/margins": 0.03531494736671448, "rewards/rejected": 0.2317737638950348, "step": 242 }, { "epoch": 0.13, "learning_rate": 9.999809244590217e-08, "logits/chosen": -2.038872003555298, "logits/rejected": -2.034085273742676, "logps/chosen": -9.981663703918457, "logps/rejected": -7.419421195983887, "loss": 0.6594, "rewards/accuracies": 1.0, "rewards/chosen": 0.20186395943164825, "rewards/margins": 0.06875009834766388, "rewards/rejected": 0.13311386108398438, "step": 243 }, { "epoch": 0.13, "learning_rate": 9.999789692297783e-08, "logits/chosen": -2.1921546459198, "logits/rejected": -2.1916370391845703, "logps/chosen": -34.5789909362793, "logps/rejected": -11.838483810424805, "loss": 0.6313, "rewards/accuracies": 1.0, "rewards/chosen": 0.23464012145996094, "rewards/margins": 0.12775735557079315, "rewards/rejected": 0.10688276588916779, "step": 244 }, { "epoch": 0.13, "learning_rate": 9.999769186262368e-08, "logits/chosen": -2.119488477706909, "logits/rejected": -2.2721877098083496, "logps/chosen": -6.430060386657715, "logps/rejected": -6.453800201416016, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.29887866973876953, "rewards/margins": 0.01163625717163086, "rewards/rejected": 0.28724241256713867, "step": 245 }, { "epoch": 0.13, "learning_rate": 9.999747726487884e-08, "logits/chosen": -2.085439920425415, "logits/rejected": -2.0830044746398926, "logps/chosen": -19.108749389648438, "logps/rejected": -8.209073066711426, "loss": 0.6434, "rewards/accuracies": 1.0, "rewards/chosen": 0.329759418964386, "rewards/margins": 0.10203029215335846, "rewards/rejected": 0.22772912681102753, "step": 246 }, { "epoch": 0.13, "learning_rate": 9.999725312978424e-08, "logits/chosen": -2.1719515323638916, "logits/rejected": -2.177116632461548, "logps/chosen": -10.691566467285156, "logps/rejected": -14.505273818969727, "loss": 0.6271, "rewards/accuracies": 1.0, "rewards/chosen": 0.3192365765571594, "rewards/margins": 0.1368148922920227, "rewards/rejected": 0.18242168426513672, "step": 247 }, { "epoch": 0.13, "learning_rate": 9.999701945738265e-08, "logits/chosen": -2.109052896499634, "logits/rejected": -2.116290807723999, "logps/chosen": -10.565057754516602, "logps/rejected": -8.02202320098877, "loss": 0.5834, "rewards/accuracies": 1.0, "rewards/chosen": 0.4250354766845703, "rewards/margins": 0.23296155035495758, "rewards/rejected": 0.19207392632961273, "step": 248 }, { "epoch": 0.13, "learning_rate": 9.999677624771862e-08, "logits/chosen": -2.1756112575531006, "logits/rejected": -2.1838953495025635, "logps/chosen": -8.73164176940918, "logps/rejected": -7.1504716873168945, "loss": 0.615, "rewards/accuracies": 1.0, "rewards/chosen": 0.36036187410354614, "rewards/margins": 0.16294251382350922, "rewards/rejected": 0.19741936028003693, "step": 249 }, { "epoch": 0.13, "learning_rate": 9.999652350083857e-08, "logits/chosen": -2.034069776535034, "logits/rejected": -2.044973373413086, "logps/chosen": -10.897028923034668, "logps/rejected": -6.762753963470459, "loss": 0.5896, "rewards/accuracies": 1.0, "rewards/chosen": 0.3959435522556305, "rewards/margins": 0.21900954842567444, "rewards/rejected": 0.17693400382995605, "step": 250 }, { "epoch": 0.14, "learning_rate": 9.99962612167907e-08, "logits/chosen": -2.0435750484466553, "logits/rejected": -2.044659376144409, "logps/chosen": -5.782021522521973, "logps/rejected": -8.184271812438965, "loss": 0.6121, "rewards/accuracies": 1.0, "rewards/chosen": 0.33644112944602966, "rewards/margins": 0.16934174299240112, "rewards/rejected": 0.16709938645362854, "step": 251 }, { "epoch": 0.14, "learning_rate": 9.999598939562503e-08, "logits/chosen": -2.1299891471862793, "logits/rejected": -2.188133478164673, "logps/chosen": -16.194358825683594, "logps/rejected": -21.671688079833984, "loss": 0.5821, "rewards/accuracies": 1.0, "rewards/chosen": 0.3085647523403168, "rewards/margins": 0.236066997051239, "rewards/rejected": 0.07249774783849716, "step": 252 }, { "epoch": 0.14, "learning_rate": 9.999570803739344e-08, "logits/chosen": -2.140599489212036, "logits/rejected": -2.125805616378784, "logps/chosen": -13.17194652557373, "logps/rejected": -10.088711738586426, "loss": 0.6265, "rewards/accuracies": 1.0, "rewards/chosen": 0.25603124499320984, "rewards/margins": 0.13797885179519653, "rewards/rejected": 0.11805238574743271, "step": 253 }, { "epoch": 0.14, "learning_rate": 9.999541714214958e-08, "logits/chosen": -1.9578945636749268, "logits/rejected": -1.965614914894104, "logps/chosen": -10.346663475036621, "logps/rejected": -9.065790176391602, "loss": 0.6285, "rewards/accuracies": 1.0, "rewards/chosen": 0.27706384658813477, "rewards/margins": 0.13369417190551758, "rewards/rejected": 0.1433696746826172, "step": 254 }, { "epoch": 0.14, "learning_rate": 9.999511670994895e-08, "logits/chosen": -2.136137008666992, "logits/rejected": -2.1384212970733643, "logps/chosen": -11.83320140838623, "logps/rejected": -7.824575901031494, "loss": 0.6621, "rewards/accuracies": 1.0, "rewards/chosen": 0.3637530505657196, "rewards/margins": 0.063052237033844, "rewards/rejected": 0.3007008135318756, "step": 255 }, { "epoch": 0.14, "learning_rate": 9.999480674084886e-08, "logits/chosen": -2.2617759704589844, "logits/rejected": -2.5018889904022217, "logps/chosen": -20.743576049804688, "logps/rejected": -35.22486877441406, "loss": 0.7052, "rewards/accuracies": 0.0, "rewards/chosen": 0.09291458129882812, "rewards/margins": -0.023998260498046875, "rewards/rejected": 0.116912841796875, "step": 256 }, { "epoch": 0.14, "learning_rate": 9.999448723490842e-08, "logits/chosen": -2.2062032222747803, "logits/rejected": -2.2078909873962402, "logps/chosen": -16.394041061401367, "logps/rejected": -7.602574825286865, "loss": 0.6325, "rewards/accuracies": 1.0, "rewards/chosen": 0.2747560441493988, "rewards/margins": 0.1252363622188568, "rewards/rejected": 0.149519681930542, "step": 257 }, { "epoch": 0.14, "learning_rate": 9.999415819218859e-08, "logits/chosen": -2.069439172744751, "logits/rejected": -2.0670013427734375, "logps/chosen": -16.95237159729004, "logps/rejected": -7.1795654296875, "loss": 0.6024, "rewards/accuracies": 1.0, "rewards/chosen": 0.36707744002342224, "rewards/margins": 0.19062460958957672, "rewards/rejected": 0.17645283043384552, "step": 258 }, { "epoch": 0.14, "learning_rate": 9.999381961275215e-08, "logits/chosen": -1.9901835918426514, "logits/rejected": -2.291182279586792, "logps/chosen": -6.723194122314453, "logps/rejected": -6.923426151275635, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.4024503827095032, "rewards/margins": 0.015597671270370483, "rewards/rejected": 0.3868527114391327, "step": 259 }, { "epoch": 0.14, "learning_rate": 9.999347149666369e-08, "logits/chosen": -2.0801727771759033, "logits/rejected": -2.080463409423828, "logps/chosen": -13.111412048339844, "logps/rejected": -7.668540000915527, "loss": 0.6498, "rewards/accuracies": 1.0, "rewards/chosen": 0.4127269685268402, "rewards/margins": 0.08875149488449097, "rewards/rejected": 0.32397547364234924, "step": 260 }, { "epoch": 0.14, "learning_rate": 9.999311384398958e-08, "logits/chosen": -2.164205551147461, "logits/rejected": -2.2697839736938477, "logps/chosen": -7.082120895385742, "logps/rejected": -7.181486129760742, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.3810744285583496, "rewards/margins": -0.0016356408596038818, "rewards/rejected": 0.3827100694179535, "step": 261 }, { "epoch": 0.14, "learning_rate": 9.999274665479807e-08, "logits/chosen": -2.152682304382324, "logits/rejected": -2.1654646396636963, "logps/chosen": -18.17397689819336, "logps/rejected": -8.990263938903809, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.24806824326515198, "rewards/margins": 0.0333930104970932, "rewards/rejected": 0.21467523276805878, "step": 262 }, { "epoch": 0.14, "learning_rate": 9.99923699291592e-08, "logits/chosen": -2.0885064601898193, "logits/rejected": -2.3172099590301514, "logps/chosen": -8.708243370056152, "logps/rejected": -8.47400951385498, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.2463923543691635, "rewards/margins": -0.00233401358127594, "rewards/rejected": 0.24872636795043945, "step": 263 }, { "epoch": 0.14, "learning_rate": 9.999198366714483e-08, "logits/chosen": -2.1486778259277344, "logits/rejected": -2.2995223999023438, "logps/chosen": -5.382397651672363, "logps/rejected": -5.048969268798828, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.2176777422428131, "rewards/margins": 0.004256114363670349, "rewards/rejected": 0.21342162787914276, "step": 264 }, { "epoch": 0.14, "learning_rate": 9.999158786882865e-08, "logits/chosen": -2.004544973373413, "logits/rejected": -1.998400092124939, "logps/chosen": -8.89130687713623, "logps/rejected": -9.023113250732422, "loss": 0.6366, "rewards/accuracies": 1.0, "rewards/chosen": 0.33308419585227966, "rewards/margins": 0.11640967428684235, "rewards/rejected": 0.21667452156543732, "step": 265 }, { "epoch": 0.14, "learning_rate": 9.999118253428616e-08, "logits/chosen": -2.0840702056884766, "logits/rejected": -2.3383820056915283, "logps/chosen": -12.50151252746582, "logps/rejected": -12.251004219055176, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.26393088698387146, "rewards/margins": -0.004131436347961426, "rewards/rejected": 0.2680623233318329, "step": 266 }, { "epoch": 0.14, "learning_rate": 9.999076766359466e-08, "logits/chosen": -2.0344722270965576, "logits/rejected": -2.032658338546753, "logps/chosen": -9.462345123291016, "logps/rejected": -8.586137771606445, "loss": 0.5946, "rewards/accuracies": 1.0, "rewards/chosen": 0.4029712677001953, "rewards/margins": 0.20792235434055328, "rewards/rejected": 0.19504891335964203, "step": 267 }, { "epoch": 0.14, "learning_rate": 9.99903432568333e-08, "logits/chosen": -2.091207981109619, "logits/rejected": -2.091578245162964, "logps/chosen": -13.847692489624023, "logps/rejected": -6.548073768615723, "loss": 0.5736, "rewards/accuracies": 1.0, "rewards/chosen": 0.48976993560791016, "rewards/margins": 0.25540798902511597, "rewards/rejected": 0.234361931681633, "step": 268 }, { "epoch": 0.15, "learning_rate": 9.998990931408307e-08, "logits/chosen": -2.1119790077209473, "logits/rejected": -2.126814126968384, "logps/chosen": -11.923189163208008, "logps/rejected": -8.74669075012207, "loss": 0.5889, "rewards/accuracies": 1.0, "rewards/chosen": 0.49390849471092224, "rewards/margins": 0.22053220868110657, "rewards/rejected": 0.2733762860298157, "step": 269 }, { "epoch": 0.15, "learning_rate": 9.99894658354267e-08, "logits/chosen": -2.1491622924804688, "logits/rejected": -2.3954262733459473, "logps/chosen": -8.164840698242188, "logps/rejected": -8.060830116271973, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.30936184525489807, "rewards/margins": 0.007927507162094116, "rewards/rejected": 0.30143433809280396, "step": 270 }, { "epoch": 0.15, "learning_rate": 9.998901282094883e-08, "logits/chosen": -2.0014634132385254, "logits/rejected": -2.001460313796997, "logps/chosen": -7.082895278930664, "logps/rejected": -8.50556755065918, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.3190608024597168, "rewards/margins": -0.008069902658462524, "rewards/rejected": 0.3271307051181793, "step": 271 }, { "epoch": 0.15, "learning_rate": 9.998855027073584e-08, "logits/chosen": -2.09269642829895, "logits/rejected": -2.0935776233673096, "logps/chosen": -14.2736234664917, "logps/rejected": -8.989416122436523, "loss": 0.6015, "rewards/accuracies": 1.0, "rewards/chosen": 0.3793852925300598, "rewards/margins": 0.19256611168384552, "rewards/rejected": 0.1868191808462143, "step": 272 }, { "epoch": 0.15, "learning_rate": 9.998807818487598e-08, "logits/chosen": -2.164442539215088, "logits/rejected": -2.2786812782287598, "logps/chosen": -5.597317218780518, "logps/rejected": -5.7493462562561035, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.2775285243988037, "rewards/margins": 0.008694827556610107, "rewards/rejected": 0.2688336968421936, "step": 273 }, { "epoch": 0.15, "learning_rate": 9.99875965634593e-08, "logits/chosen": -2.049560308456421, "logits/rejected": -2.316601276397705, "logps/chosen": -5.073912620544434, "logps/rejected": -5.232333183288574, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.4613833427429199, "rewards/margins": 0.029885083436965942, "rewards/rejected": 0.431498259305954, "step": 274 }, { "epoch": 0.15, "learning_rate": 9.998710540657767e-08, "logits/chosen": -2.0625722408294678, "logits/rejected": -2.058258533477783, "logps/chosen": -16.01251792907715, "logps/rejected": -6.275405406951904, "loss": 0.537, "rewards/accuracies": 1.0, "rewards/chosen": 0.4631311595439911, "rewards/margins": 0.3413762152194977, "rewards/rejected": 0.12175493687391281, "step": 275 }, { "epoch": 0.15, "learning_rate": 9.998660471432479e-08, "logits/chosen": -2.160646915435791, "logits/rejected": -2.1633188724517822, "logps/chosen": -8.021866798400879, "logps/rejected": -8.405424118041992, "loss": 0.6151, "rewards/accuracies": 1.0, "rewards/chosen": 0.39893999695777893, "rewards/margins": 0.16272898018360138, "rewards/rejected": 0.23621101677417755, "step": 276 }, { "epoch": 0.15, "learning_rate": 9.998609448679615e-08, "logits/chosen": -2.0800905227661133, "logits/rejected": -2.270730495452881, "logps/chosen": -5.401823043823242, "logps/rejected": -5.194502830505371, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.3624567985534668, "rewards/margins": 0.015683263540267944, "rewards/rejected": 0.34677353501319885, "step": 277 }, { "epoch": 0.15, "learning_rate": 9.99855747240891e-08, "logits/chosen": -2.1158201694488525, "logits/rejected": -2.3616058826446533, "logps/chosen": -13.872809410095215, "logps/rejected": -14.007814407348633, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.3675927221775055, "rewards/margins": 0.018483251333236694, "rewards/rejected": 0.3491094708442688, "step": 278 }, { "epoch": 0.15, "learning_rate": 9.99850454263028e-08, "logits/chosen": -2.178318738937378, "logits/rejected": -2.25342059135437, "logps/chosen": -5.059470176696777, "logps/rejected": -5.050293445587158, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.39480048418045044, "rewards/margins": 0.023478269577026367, "rewards/rejected": 0.3713222146034241, "step": 279 }, { "epoch": 0.15, "learning_rate": 9.998450659353818e-08, "logits/chosen": -2.1167688369750977, "logits/rejected": -2.3404064178466797, "logps/chosen": -16.085739135742188, "logps/rejected": -13.393928527832031, "loss": 0.6467, "rewards/accuracies": 1.0, "rewards/chosen": 0.31479454040527344, "rewards/margins": 0.09517277777194977, "rewards/rejected": 0.21962176263332367, "step": 280 }, { "epoch": 0.15, "learning_rate": 9.998395822589805e-08, "logits/chosen": -2.223130226135254, "logits/rejected": -2.2819061279296875, "logps/chosen": -7.323631286621094, "logps/rejected": -7.084473609924316, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.2671348750591278, "rewards/margins": 0.004642128944396973, "rewards/rejected": 0.26249274611473083, "step": 281 }, { "epoch": 0.15, "learning_rate": 9.998340032348702e-08, "logits/chosen": -2.2172484397888184, "logits/rejected": -2.2110390663146973, "logps/chosen": -14.53367805480957, "logps/rejected": -7.766631126403809, "loss": 0.5752, "rewards/accuracies": 1.0, "rewards/chosen": 0.4686315655708313, "rewards/margins": 0.25159895420074463, "rewards/rejected": 0.21703262627124786, "step": 282 }, { "epoch": 0.15, "learning_rate": 9.99828328864115e-08, "logits/chosen": -2.066713571548462, "logits/rejected": -2.319633960723877, "logps/chosen": -12.38263988494873, "logps/rejected": -6.882437705993652, "loss": 0.7476, "rewards/accuracies": 0.0, "rewards/chosen": 0.2746885418891907, "rewards/margins": -0.10601967573165894, "rewards/rejected": 0.3807082176208496, "step": 283 }, { "epoch": 0.15, "learning_rate": 9.998225591477973e-08, "logits/chosen": -2.158919334411621, "logits/rejected": -2.358201742172241, "logps/chosen": -12.389151573181152, "logps/rejected": -19.139020919799805, "loss": 0.64, "rewards/accuracies": 1.0, "rewards/chosen": 0.24249926209449768, "rewards/margins": 0.10936880111694336, "rewards/rejected": 0.13313046097755432, "step": 284 }, { "epoch": 0.15, "learning_rate": 9.998166940870178e-08, "logits/chosen": -2.082913637161255, "logits/rejected": -2.277000904083252, "logps/chosen": -5.650321006774902, "logps/rejected": -5.5374274253845215, "loss": 0.701, "rewards/accuracies": 0.0, "rewards/chosen": 0.2694033682346344, "rewards/margins": -0.015579372644424438, "rewards/rejected": 0.28498274087905884, "step": 285 }, { "epoch": 0.15, "learning_rate": 9.998107336828954e-08, "logits/chosen": -2.159731388092041, "logits/rejected": -2.1638295650482178, "logps/chosen": -7.306266784667969, "logps/rejected": -7.0966877937316895, "loss": 0.6066, "rewards/accuracies": 1.0, "rewards/chosen": 0.4308147430419922, "rewards/margins": 0.18139386177062988, "rewards/rejected": 0.2494208812713623, "step": 286 }, { "epoch": 0.15, "learning_rate": 9.998046779365668e-08, "logits/chosen": -2.069225788116455, "logits/rejected": -2.357633590698242, "logps/chosen": -5.750844478607178, "logps/rejected": -5.568005084991455, "loss": 0.6714, "rewards/accuracies": 1.0, "rewards/chosen": 0.4276943802833557, "rewards/margins": 0.04400348663330078, "rewards/rejected": 0.38369089365005493, "step": 287 }, { "epoch": 0.16, "learning_rate": 9.997985268491873e-08, "logits/chosen": -2.1545612812042236, "logits/rejected": -2.3309781551361084, "logps/chosen": -6.2083821296691895, "logps/rejected": -6.315848350524902, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.39891132712364197, "rewards/margins": 0.04260009527206421, "rewards/rejected": 0.35631123185157776, "step": 288 }, { "epoch": 0.16, "learning_rate": 9.997922804219305e-08, "logits/chosen": -2.2235817909240723, "logits/rejected": -2.108905076980591, "logps/chosen": -75.78160858154297, "logps/rejected": -9.735153198242188, "loss": 0.6337, "rewards/accuracies": 1.0, "rewards/chosen": 0.3131393492221832, "rewards/margins": 0.12265463173389435, "rewards/rejected": 0.19048471748828888, "step": 289 }, { "epoch": 0.16, "learning_rate": 9.997859386559875e-08, "logits/chosen": -2.151442050933838, "logits/rejected": -2.157247304916382, "logps/chosen": -26.920074462890625, "logps/rejected": -12.622961044311523, "loss": 0.4891, "rewards/accuracies": 1.0, "rewards/chosen": 0.7086154818534851, "rewards/margins": 0.4605853855609894, "rewards/rejected": 0.24803009629249573, "step": 290 }, { "epoch": 0.16, "learning_rate": 9.997795015525685e-08, "logits/chosen": -2.1828160285949707, "logits/rejected": -2.189765453338623, "logps/chosen": -6.979640007019043, "logps/rejected": -6.1038594245910645, "loss": 0.6245, "rewards/accuracies": 1.0, "rewards/chosen": 0.4878535270690918, "rewards/margins": 0.14234748482704163, "rewards/rejected": 0.34550604224205017, "step": 291 }, { "epoch": 0.16, "learning_rate": 9.997729691129011e-08, "logits/chosen": -2.085989236831665, "logits/rejected": -2.168757438659668, "logps/chosen": -11.39266586303711, "logps/rejected": -23.407588958740234, "loss": 0.5655, "rewards/accuracies": 1.0, "rewards/chosen": 0.4834609925746918, "rewards/margins": 0.2739967107772827, "rewards/rejected": 0.20946426689624786, "step": 292 }, { "epoch": 0.16, "learning_rate": 9.997663413382315e-08, "logits/chosen": -1.9857231378555298, "logits/rejected": -1.9797041416168213, "logps/chosen": -15.207049369812012, "logps/rejected": -6.348121643066406, "loss": 0.5592, "rewards/accuracies": 1.0, "rewards/chosen": 0.4614792764186859, "rewards/margins": 0.28874671459198, "rewards/rejected": 0.17273254692554474, "step": 293 }, { "epoch": 0.16, "learning_rate": 9.99759618229824e-08, "logits/chosen": -1.9813377857208252, "logits/rejected": -1.9817792177200317, "logps/chosen": -5.644295692443848, "logps/rejected": -4.979152679443359, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.46742841601371765, "rewards/margins": 0.030829429626464844, "rewards/rejected": 0.4365989863872528, "step": 294 }, { "epoch": 0.16, "learning_rate": 9.99752799788961e-08, "logits/chosen": -2.2342121601104736, "logits/rejected": -2.236283302307129, "logps/chosen": -7.990480422973633, "logps/rejected": -8.11867904663086, "loss": 0.5429, "rewards/accuracies": 1.0, "rewards/chosen": 0.5966281890869141, "rewards/margins": 0.3272066116333008, "rewards/rejected": 0.2694215774536133, "step": 295 }, { "epoch": 0.16, "learning_rate": 9.997458860169434e-08, "logits/chosen": -2.1932191848754883, "logits/rejected": -2.3269548416137695, "logps/chosen": -6.413542747497559, "logps/rejected": -6.363767623901367, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.37605124711990356, "rewards/margins": 0.008487612009048462, "rewards/rejected": 0.3675636351108551, "step": 296 }, { "epoch": 0.16, "learning_rate": 9.997388769150897e-08, "logits/chosen": -2.074430465698242, "logits/rejected": -2.0766849517822266, "logps/chosen": -5.212270736694336, "logps/rejected": -3.7366037368774414, "loss": 0.6409, "rewards/accuracies": 1.0, "rewards/chosen": 0.516422688961029, "rewards/margins": 0.10746058821678162, "rewards/rejected": 0.40896210074424744, "step": 297 }, { "epoch": 0.16, "learning_rate": 9.997317724847371e-08, "logits/chosen": -2.0531821250915527, "logits/rejected": -2.3267643451690674, "logps/chosen": -11.15345573425293, "logps/rejected": -10.778470039367676, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.21297912299633026, "rewards/margins": 0.0172882080078125, "rewards/rejected": 0.19569091498851776, "step": 298 }, { "epoch": 0.16, "learning_rate": 9.997245727272412e-08, "logits/chosen": -2.304080009460449, "logits/rejected": -2.0486462116241455, "logps/chosen": -96.85047912597656, "logps/rejected": -17.283205032348633, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.2900222837924957, "rewards/margins": 0.004183381795883179, "rewards/rejected": 0.28583890199661255, "step": 299 }, { "epoch": 0.16, "learning_rate": 9.997172776439746e-08, "logits/chosen": -2.039517402648926, "logits/rejected": -2.0392632484436035, "logps/chosen": -8.231796264648438, "logps/rejected": -5.189222812652588, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.513573944568634, "rewards/margins": 0.03430095314979553, "rewards/rejected": 0.4792729914188385, "step": 300 }, { "epoch": 0.16, "learning_rate": 9.997098872363295e-08, "logits/chosen": -2.007901906967163, "logits/rejected": -2.006070613861084, "logps/chosen": -9.012090682983398, "logps/rejected": -9.35210132598877, "loss": 0.5999, "rewards/accuracies": 1.0, "rewards/chosen": 0.5469293594360352, "rewards/margins": 0.19601449370384216, "rewards/rejected": 0.350914865732193, "step": 301 }, { "epoch": 0.16, "learning_rate": 9.997024015057155e-08, "logits/chosen": -2.1167426109313965, "logits/rejected": -2.273815631866455, "logps/chosen": -5.095648288726807, "logps/rejected": -11.368555068969727, "loss": 0.6528, "rewards/accuracies": 1.0, "rewards/chosen": 0.4125278890132904, "rewards/margins": 0.0824451744556427, "rewards/rejected": 0.3300827145576477, "step": 302 }, { "epoch": 0.16, "learning_rate": 9.996948204535605e-08, "logits/chosen": -1.995681643486023, "logits/rejected": -2.0045270919799805, "logps/chosen": -8.474050521850586, "logps/rejected": -9.371011734008789, "loss": 0.618, "rewards/accuracies": 1.0, "rewards/chosen": 0.513133704662323, "rewards/margins": 0.15635231137275696, "rewards/rejected": 0.35678139328956604, "step": 303 }, { "epoch": 0.16, "learning_rate": 9.996871440813107e-08, "logits/chosen": -2.232762336730957, "logits/rejected": -2.101919412612915, "logps/chosen": -50.730506896972656, "logps/rejected": -12.883939743041992, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 0.32173576951026917, "rewards/margins": -0.021721065044403076, "rewards/rejected": 0.34345683455467224, "step": 304 }, { "epoch": 0.16, "learning_rate": 9.996793723904304e-08, "logits/chosen": -2.0759949684143066, "logits/rejected": -2.0730106830596924, "logps/chosen": -17.69005584716797, "logps/rejected": -6.6165947914123535, "loss": 0.6595, "rewards/accuracies": 1.0, "rewards/chosen": 0.38281136751174927, "rewards/margins": 0.06854549050331116, "rewards/rejected": 0.3142658770084381, "step": 305 }, { "epoch": 0.17, "learning_rate": 9.99671505382402e-08, "logits/chosen": -2.140286445617676, "logits/rejected": -2.308469295501709, "logps/chosen": -8.69125747680664, "logps/rejected": -6.4845380783081055, "loss": 0.7254, "rewards/accuracies": 0.0, "rewards/chosen": 0.2226140946149826, "rewards/margins": -0.0634838193655014, "rewards/rejected": 0.286097913980484, "step": 306 }, { "epoch": 0.17, "learning_rate": 9.996635430587265e-08, "logits/chosen": -2.2249786853790283, "logits/rejected": -2.393651008605957, "logps/chosen": -5.711596965789795, "logps/rejected": -5.527539253234863, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.4013002812862396, "rewards/margins": -0.0076379477977752686, "rewards/rejected": 0.4089382290840149, "step": 307 }, { "epoch": 0.17, "learning_rate": 9.996554854209224e-08, "logits/chosen": -2.093395233154297, "logits/rejected": -2.289867639541626, "logps/chosen": -5.5446271896362305, "logps/rejected": -5.514730930328369, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.4666427671909332, "rewards/margins": -0.003861755132675171, "rewards/rejected": 0.4705045223236084, "step": 308 }, { "epoch": 0.17, "learning_rate": 9.996473324705269e-08, "logits/chosen": -2.1480751037597656, "logits/rejected": -2.160641670227051, "logps/chosen": -9.026450157165527, "logps/rejected": -9.354069709777832, "loss": 0.5331, "rewards/accuracies": 1.0, "rewards/chosen": 0.7707427144050598, "rewards/margins": 0.35079213976860046, "rewards/rejected": 0.41995057463645935, "step": 309 }, { "epoch": 0.17, "learning_rate": 9.996390842090952e-08, "logits/chosen": -2.035489320755005, "logits/rejected": -2.035681962966919, "logps/chosen": -6.432090759277344, "logps/rejected": -6.401393890380859, "loss": 0.6161, "rewards/accuracies": 1.0, "rewards/chosen": 0.5826517343521118, "rewards/margins": 0.16045162081718445, "rewards/rejected": 0.42220011353492737, "step": 310 }, { "epoch": 0.17, "learning_rate": 9.996307406382008e-08, "logits/chosen": -2.1933717727661133, "logits/rejected": -2.301452159881592, "logps/chosen": -5.80235481262207, "logps/rejected": -5.687054634094238, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": 0.4585416913032532, "rewards/margins": -0.018514543771743774, "rewards/rejected": 0.47705623507499695, "step": 311 }, { "epoch": 0.17, "learning_rate": 9.996223017594352e-08, "logits/chosen": -2.149115562438965, "logits/rejected": -2.1526169776916504, "logps/chosen": -5.636733055114746, "logps/rejected": -6.152950286865234, "loss": 0.5673, "rewards/accuracies": 1.0, "rewards/chosen": 0.5875675082206726, "rewards/margins": 0.2697751820087433, "rewards/rejected": 0.3177923262119293, "step": 312 }, { "epoch": 0.17, "learning_rate": 9.996137675744082e-08, "logits/chosen": -2.284937858581543, "logits/rejected": -2.460805892944336, "logps/chosen": -5.093149185180664, "logps/rejected": -4.912595748901367, "loss": 0.7023, "rewards/accuracies": 0.0, "rewards/chosen": 0.40678873658180237, "rewards/margins": -0.018222898244857788, "rewards/rejected": 0.42501163482666016, "step": 313 }, { "epoch": 0.17, "learning_rate": 9.996051380847477e-08, "logits/chosen": -2.141136646270752, "logits/rejected": -2.311836004257202, "logps/chosen": -6.419684410095215, "logps/rejected": -4.8653082847595215, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.4555845260620117, "rewards/margins": 0.0165766179561615, "rewards/rejected": 0.4390079081058502, "step": 314 }, { "epoch": 0.17, "learning_rate": 9.995964132921e-08, "logits/chosen": -2.192791223526001, "logits/rejected": -2.092252016067505, "logps/chosen": -40.373897552490234, "logps/rejected": -5.345836639404297, "loss": 0.5883, "rewards/accuracies": 1.0, "rewards/chosen": 0.5401714444160461, "rewards/margins": 0.22195550799369812, "rewards/rejected": 0.318215936422348, "step": 315 }, { "epoch": 0.17, "learning_rate": 9.995875931981292e-08, "logits/chosen": -2.0894479751586914, "logits/rejected": -2.295440435409546, "logps/chosen": -6.184067726135254, "logps/rejected": -9.368955612182617, "loss": 0.646, "rewards/accuracies": 1.0, "rewards/chosen": 0.5807310342788696, "rewards/margins": 0.09654000401496887, "rewards/rejected": 0.48419103026390076, "step": 316 }, { "epoch": 0.17, "learning_rate": 9.995786778045178e-08, "logits/chosen": -2.1772398948669434, "logits/rejected": -2.1629109382629395, "logps/chosen": -20.381359100341797, "logps/rejected": -6.44844913482666, "loss": 0.6023, "rewards/accuracies": 1.0, "rewards/chosen": 0.5380887985229492, "rewards/margins": 0.1907137930393219, "rewards/rejected": 0.3473750054836273, "step": 317 }, { "epoch": 0.17, "learning_rate": 9.995696671129668e-08, "logits/chosen": -2.2293262481689453, "logits/rejected": -2.2336134910583496, "logps/chosen": -8.079490661621094, "logps/rejected": -4.953893661499023, "loss": 0.6246, "rewards/accuracies": 1.0, "rewards/chosen": 0.6741012930870056, "rewards/margins": 0.1420522928237915, "rewards/rejected": 0.5320490002632141, "step": 318 }, { "epoch": 0.17, "learning_rate": 9.995605611251948e-08, "logits/chosen": -2.1458897590637207, "logits/rejected": -2.3996422290802, "logps/chosen": -3.632479190826416, "logps/rejected": -3.4727673530578613, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.5611423254013062, "rewards/margins": -0.0057533979415893555, "rewards/rejected": 0.5668957233428955, "step": 319 }, { "epoch": 0.17, "learning_rate": 9.995513598429385e-08, "logits/chosen": -2.2016031742095947, "logits/rejected": -2.1615869998931885, "logps/chosen": -27.84693717956543, "logps/rejected": -5.981766223907471, "loss": 0.5948, "rewards/accuracies": 1.0, "rewards/chosen": 0.5446611642837524, "rewards/margins": 0.2074103057384491, "rewards/rejected": 0.33725085854530334, "step": 320 }, { "epoch": 0.17, "learning_rate": 9.995420632679536e-08, "logits/chosen": -2.17590069770813, "logits/rejected": -2.1831843852996826, "logps/chosen": -7.494418621063232, "logps/rejected": -12.074263572692871, "loss": 0.5588, "rewards/accuracies": 1.0, "rewards/chosen": 0.5611676573753357, "rewards/margins": 0.2895958125591278, "rewards/rejected": 0.2715718448162079, "step": 321 }, { "epoch": 0.17, "learning_rate": 9.995326714020134e-08, "logits/chosen": -2.0895400047302246, "logits/rejected": -2.326443910598755, "logps/chosen": -3.617295742034912, "logps/rejected": -3.630131721496582, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.5692917704582214, "rewards/margins": 0.026673078536987305, "rewards/rejected": 0.5426186919212341, "step": 322 }, { "epoch": 0.17, "learning_rate": 9.995231842469093e-08, "logits/chosen": -1.9980443716049194, "logits/rejected": -1.9976283311843872, "logps/chosen": -4.250730514526367, "logps/rejected": -4.677827835083008, "loss": 0.6758, "rewards/accuracies": 1.0, "rewards/chosen": 0.607369065284729, "rewards/margins": 0.035094261169433594, "rewards/rejected": 0.5722748041152954, "step": 323 }, { "epoch": 0.17, "learning_rate": 9.995136018044512e-08, "logits/chosen": -2.154649496078491, "logits/rejected": -2.320871114730835, "logps/chosen": -4.12216854095459, "logps/rejected": -4.130582809448242, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.4768950641155243, "rewards/margins": 0.023935705423355103, "rewards/rejected": 0.4529593586921692, "step": 324 }, { "epoch": 0.18, "learning_rate": 9.995039240764669e-08, "logits/chosen": -2.126695156097412, "logits/rejected": -2.1129977703094482, "logps/chosen": -24.35057258605957, "logps/rejected": -4.562667369842529, "loss": 0.4889, "rewards/accuracies": 1.0, "rewards/chosen": 0.8105798959732056, "rewards/margins": 0.4612410366535187, "rewards/rejected": 0.3493388593196869, "step": 325 }, { "epoch": 0.18, "learning_rate": 9.994941510648025e-08, "logits/chosen": -2.079249620437622, "logits/rejected": -2.2984836101531982, "logps/chosen": -4.3275556564331055, "logps/rejected": -4.199099540710449, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.44881144165992737, "rewards/margins": 0.006537258625030518, "rewards/rejected": 0.44227418303489685, "step": 326 }, { "epoch": 0.18, "learning_rate": 9.994842827713223e-08, "logits/chosen": -2.1803247928619385, "logits/rejected": -2.304046154022217, "logps/chosen": -6.986059188842773, "logps/rejected": -7.140329360961914, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.46738386154174805, "rewards/margins": -0.00792190432548523, "rewards/rejected": 0.4753057658672333, "step": 327 }, { "epoch": 0.18, "learning_rate": 9.994743191979088e-08, "logits/chosen": -2.1023576259613037, "logits/rejected": -2.111717462539673, "logps/chosen": -6.517841339111328, "logps/rejected": -5.720330238342285, "loss": 0.5316, "rewards/accuracies": 1.0, "rewards/chosen": 0.6970362067222595, "rewards/margins": 0.35433629155158997, "rewards/rejected": 0.34269991517066956, "step": 328 }, { "epoch": 0.18, "learning_rate": 9.994642603464625e-08, "logits/chosen": -2.046968460083008, "logits/rejected": -2.086920738220215, "logps/chosen": -4.992409706115723, "logps/rejected": -19.15059471130371, "loss": 0.6421, "rewards/accuracies": 1.0, "rewards/chosen": 0.3827202022075653, "rewards/margins": 0.10494500398635864, "rewards/rejected": 0.27777519822120667, "step": 329 }, { "epoch": 0.18, "learning_rate": 9.994541062189023e-08, "logits/chosen": -2.1029398441314697, "logits/rejected": -2.103391408920288, "logps/chosen": -14.16975212097168, "logps/rejected": -5.856206893920898, "loss": 0.597, "rewards/accuracies": 1.0, "rewards/chosen": 0.656911313533783, "rewards/margins": 0.2025785744190216, "rewards/rejected": 0.45433273911476135, "step": 330 }, { "epoch": 0.18, "learning_rate": 9.99443856817165e-08, "logits/chosen": -2.0228710174560547, "logits/rejected": -2.2866530418395996, "logps/chosen": -5.066980361938477, "logps/rejected": -4.8262038230896, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.5448280572891235, "rewards/margins": 0.019056499004364014, "rewards/rejected": 0.5257715582847595, "step": 331 }, { "epoch": 0.18, "learning_rate": 9.994335121432059e-08, "logits/chosen": -1.9888343811035156, "logits/rejected": -2.295816659927368, "logps/chosen": -15.187311172485352, "logps/rejected": -5.715184211730957, "loss": 0.7408, "rewards/accuracies": 0.0, "rewards/chosen": 0.37818795442581177, "rewards/margins": -0.09310340881347656, "rewards/rejected": 0.47129136323928833, "step": 332 }, { "epoch": 0.18, "learning_rate": 9.994230721989982e-08, "logits/chosen": -2.138420820236206, "logits/rejected": -2.1417434215545654, "logps/chosen": -10.807579040527344, "logps/rejected": -9.987586975097656, "loss": 0.6155, "rewards/accuracies": 1.0, "rewards/chosen": 0.5415008664131165, "rewards/margins": 0.16190433502197266, "rewards/rejected": 0.3795965313911438, "step": 333 }, { "epoch": 0.18, "learning_rate": 9.994125369865335e-08, "logits/chosen": -2.04403018951416, "logits/rejected": -2.0537588596343994, "logps/chosen": -6.214930534362793, "logps/rejected": -4.4598846435546875, "loss": 0.5558, "rewards/accuracies": 1.0, "rewards/chosen": 0.7170554399490356, "rewards/margins": 0.2966283857822418, "rewards/rejected": 0.4204270541667938, "step": 334 }, { "epoch": 0.18, "learning_rate": 9.994019065078216e-08, "logits/chosen": -2.1186797618865967, "logits/rejected": -2.1197125911712646, "logps/chosen": -3.60227370262146, "logps/rejected": -8.434097290039062, "loss": 0.5668, "rewards/accuracies": 1.0, "rewards/chosen": 0.6972455382347107, "rewards/margins": 0.27090322971343994, "rewards/rejected": 0.42634230852127075, "step": 335 }, { "epoch": 0.18, "learning_rate": 9.993911807648899e-08, "logits/chosen": -2.11940336227417, "logits/rejected": -2.318514347076416, "logps/chosen": -4.298962116241455, "logps/rejected": -4.081375598907471, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.479584664106369, "rewards/margins": 0.019192516803741455, "rewards/rejected": 0.46039214730262756, "step": 336 }, { "epoch": 0.18, "learning_rate": 9.993803597597848e-08, "logits/chosen": -2.1486027240753174, "logits/rejected": -2.1129114627838135, "logps/chosen": -28.656360626220703, "logps/rejected": -11.32725715637207, "loss": 0.6162, "rewards/accuracies": 1.0, "rewards/chosen": 0.6361278891563416, "rewards/margins": 0.16034147143363953, "rewards/rejected": 0.475786417722702, "step": 337 }, { "epoch": 0.18, "learning_rate": 9.993694434945703e-08, "logits/chosen": -2.05950927734375, "logits/rejected": -2.0609521865844727, "logps/chosen": -3.8030972480773926, "logps/rejected": -5.4553632736206055, "loss": 0.5938, "rewards/accuracies": 1.0, "rewards/chosen": 0.5082550644874573, "rewards/margins": 0.20974108576774597, "rewards/rejected": 0.2985139787197113, "step": 338 }, { "epoch": 0.18, "learning_rate": 9.993584319713288e-08, "logits/chosen": -2.1223855018615723, "logits/rejected": -2.320434093475342, "logps/chosen": -3.5097427368164062, "logps/rejected": -3.4684829711914062, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.5568187832832336, "rewards/margins": 0.004227936267852783, "rewards/rejected": 0.5525908470153809, "step": 339 }, { "epoch": 0.18, "learning_rate": 9.993473251921606e-08, "logits/chosen": -2.1333189010620117, "logits/rejected": -2.368195056915283, "logps/chosen": -3.332646131515503, "logps/rejected": -3.276839017868042, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.5565047264099121, "rewards/margins": -0.0016123652458190918, "rewards/rejected": 0.5581170916557312, "step": 340 }, { "epoch": 0.18, "learning_rate": 9.99336123159185e-08, "logits/chosen": -2.186138868331909, "logits/rejected": -2.1901135444641113, "logps/chosen": -6.151525020599365, "logps/rejected": -4.900528907775879, "loss": 0.4712, "rewards/accuracies": 1.0, "rewards/chosen": 0.8956460952758789, "rewards/margins": 0.5076436996459961, "rewards/rejected": 0.3880023956298828, "step": 341 }, { "epoch": 0.18, "learning_rate": 9.993248258745381e-08, "logits/chosen": -2.1580193042755127, "logits/rejected": -2.158052921295166, "logps/chosen": -6.087070465087891, "logps/rejected": -3.9878101348876953, "loss": 0.5466, "rewards/accuracies": 1.0, "rewards/chosen": 0.7339966893196106, "rewards/margins": 0.318315714597702, "rewards/rejected": 0.41568097472190857, "step": 342 }, { "epoch": 0.19, "learning_rate": 9.993134333403754e-08, "logits/chosen": -2.05968976020813, "logits/rejected": -2.058136224746704, "logps/chosen": -3.6993041038513184, "logps/rejected": -4.875079154968262, "loss": 0.5983, "rewards/accuracies": 1.0, "rewards/chosen": 0.6300559639930725, "rewards/margins": 0.1996656060218811, "rewards/rejected": 0.4303903579711914, "step": 343 }, { "epoch": 0.19, "learning_rate": 9.993019455588701e-08, "logits/chosen": -2.0671420097351074, "logits/rejected": -2.272334337234497, "logps/chosen": -10.540410995483398, "logps/rejected": -8.643821716308594, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.48906680941581726, "rewards/margins": -0.008716106414794922, "rewards/rejected": 0.4977829158306122, "step": 344 }, { "epoch": 0.19, "learning_rate": 9.992903625322134e-08, "logits/chosen": -2.115753650665283, "logits/rejected": -2.3324148654937744, "logps/chosen": -5.987792491912842, "logps/rejected": -7.012900352478027, "loss": 0.6625, "rewards/accuracies": 1.0, "rewards/chosen": 0.5648949146270752, "rewards/margins": 0.06222301721572876, "rewards/rejected": 0.5026718974113464, "step": 345 }, { "epoch": 0.19, "learning_rate": 9.992786842626149e-08, "logits/chosen": -2.0279524326324463, "logits/rejected": -2.239302158355713, "logps/chosen": -2.9055991172790527, "logps/rejected": -2.9380393028259277, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.6155002117156982, "rewards/margins": 0.005635261535644531, "rewards/rejected": 0.6098649501800537, "step": 346 }, { "epoch": 0.19, "learning_rate": 9.992669107523023e-08, "logits/chosen": -2.0258421897888184, "logits/rejected": -2.026589870452881, "logps/chosen": -4.333252429962158, "logps/rejected": -5.72191858291626, "loss": 0.5542, "rewards/accuracies": 1.0, "rewards/chosen": 0.6627355217933655, "rewards/margins": 0.30032673478126526, "rewards/rejected": 0.3624087870121002, "step": 347 }, { "epoch": 0.19, "learning_rate": 9.992550420035215e-08, "logits/chosen": -2.1654200553894043, "logits/rejected": -2.3050661087036133, "logps/chosen": -4.352804183959961, "logps/rejected": -4.339906692504883, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.5488972663879395, "rewards/margins": 0.0032317042350769043, "rewards/rejected": 0.5456655621528625, "step": 348 }, { "epoch": 0.19, "learning_rate": 9.992430780185366e-08, "logits/chosen": -2.1573486328125, "logits/rejected": -2.359443187713623, "logps/chosen": -4.073700428009033, "logps/rejected": -15.794179916381836, "loss": 0.565, "rewards/accuracies": 1.0, "rewards/chosen": 0.7105129957199097, "rewards/margins": 0.27525070309638977, "rewards/rejected": 0.4352622926235199, "step": 349 }, { "epoch": 0.19, "learning_rate": 9.992310187996295e-08, "logits/chosen": -2.2314491271972656, "logits/rejected": -2.3118979930877686, "logps/chosen": -4.306148529052734, "logps/rejected": -4.307916164398193, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.6154074668884277, "rewards/margins": 0.00969916582107544, "rewards/rejected": 0.6057083010673523, "step": 350 }, { "epoch": 0.19, "learning_rate": 9.992188643491011e-08, "logits/chosen": -2.0369906425476074, "logits/rejected": -2.3143622875213623, "logps/chosen": -4.867038249969482, "logps/rejected": -4.96198844909668, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.5537416338920593, "rewards/margins": 0.009067535400390625, "rewards/rejected": 0.5446740984916687, "step": 351 }, { "epoch": 0.19, "learning_rate": 9.992066146692695e-08, "logits/chosen": -2.0793356895446777, "logits/rejected": -2.0784900188446045, "logps/chosen": -2.856945514678955, "logps/rejected": -4.036101341247559, "loss": 0.5728, "rewards/accuracies": 1.0, "rewards/chosen": 0.72920823097229, "rewards/margins": 0.2572469115257263, "rewards/rejected": 0.4719613194465637, "step": 352 }, { "epoch": 0.19, "learning_rate": 9.991942697624715e-08, "logits/chosen": -2.077547073364258, "logits/rejected": -2.2809031009674072, "logps/chosen": -3.034191370010376, "logps/rejected": -2.6659836769104004, "loss": 0.7064, "rewards/accuracies": 0.0, "rewards/chosen": 0.5458311438560486, "rewards/margins": -0.026309967041015625, "rewards/rejected": 0.5721411108970642, "step": 353 }, { "epoch": 0.19, "learning_rate": 9.991818296310621e-08, "logits/chosen": -2.1141774654388428, "logits/rejected": -2.115518808364868, "logps/chosen": -4.8627777099609375, "logps/rejected": -4.9599151611328125, "loss": 0.6619, "rewards/accuracies": 1.0, "rewards/chosen": 0.6743996739387512, "rewards/margins": 0.0635758638381958, "rewards/rejected": 0.6108238101005554, "step": 354 }, { "epoch": 0.19, "learning_rate": 9.991692942774144e-08, "logits/chosen": -2.1559059619903564, "logits/rejected": -2.161938190460205, "logps/chosen": -6.944562911987305, "logps/rejected": -6.639544486999512, "loss": 0.4556, "rewards/accuracies": 1.0, "rewards/chosen": 0.9486803412437439, "rewards/margins": 0.5496594905853271, "rewards/rejected": 0.39902088046073914, "step": 355 }, { "epoch": 0.19, "learning_rate": 9.991566637039193e-08, "logits/chosen": -2.150181531906128, "logits/rejected": -2.2583298683166504, "logps/chosen": -5.106649398803711, "logps/rejected": -4.585934162139893, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.6668030023574829, "rewards/margins": -0.007404625415802002, "rewards/rejected": 0.6742076277732849, "step": 356 }, { "epoch": 0.19, "learning_rate": 9.991439379129863e-08, "logits/chosen": -2.0674238204956055, "logits/rejected": -2.323831558227539, "logps/chosen": -3.168302059173584, "logps/rejected": -2.9414050579071045, "loss": 0.7003, "rewards/accuracies": 0.0, "rewards/chosen": 0.5735663175582886, "rewards/margins": -0.014177441596984863, "rewards/rejected": 0.5877437591552734, "step": 357 }, { "epoch": 0.19, "learning_rate": 9.991311169070432e-08, "logits/chosen": -2.1780025959014893, "logits/rejected": -2.1775877475738525, "logps/chosen": -5.404778003692627, "logps/rejected": -5.443206310272217, "loss": 0.5171, "rewards/accuracies": 1.0, "rewards/chosen": 0.6260916590690613, "rewards/margins": 0.3897451162338257, "rewards/rejected": 0.2363465279340744, "step": 358 }, { "epoch": 0.19, "learning_rate": 9.991182006885352e-08, "logits/chosen": -2.133408546447754, "logits/rejected": -2.1300694942474365, "logps/chosen": -15.117475509643555, "logps/rejected": -4.839517593383789, "loss": 0.6571, "rewards/accuracies": 1.0, "rewards/chosen": 0.5135374069213867, "rewards/margins": 0.07335585355758667, "rewards/rejected": 0.44018155336380005, "step": 359 }, { "epoch": 0.19, "learning_rate": 9.991051892599266e-08, "logits/chosen": -2.0738677978515625, "logits/rejected": -2.262834072113037, "logps/chosen": -3.2875919342041016, "logps/rejected": -3.430424451828003, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.5317204594612122, "rewards/margins": 0.01848667860031128, "rewards/rejected": 0.5132337808609009, "step": 360 }, { "epoch": 0.19, "learning_rate": 9.990920826236991e-08, "logits/chosen": -2.0903337001800537, "logits/rejected": -2.088552951812744, "logps/chosen": -15.185550689697266, "logps/rejected": -3.3544936180114746, "loss": 0.5674, "rewards/accuracies": 1.0, "rewards/chosen": 0.8918846249580383, "rewards/margins": 0.2697180509567261, "rewards/rejected": 0.6221665740013123, "step": 361 }, { "epoch": 0.2, "learning_rate": 9.990788807823531e-08, "logits/chosen": -2.0286009311676025, "logits/rejected": -2.0249149799346924, "logps/chosen": -5.513211250305176, "logps/rejected": -7.18347692489624, "loss": 0.5037, "rewards/accuracies": 1.0, "rewards/chosen": 0.6986384391784668, "rewards/margins": 0.4233974814414978, "rewards/rejected": 0.275240957736969, "step": 362 }, { "epoch": 0.2, "learning_rate": 9.990655837384069e-08, "logits/chosen": -1.9664756059646606, "logits/rejected": -1.9681512117385864, "logps/chosen": -4.990336894989014, "logps/rejected": -6.845279216766357, "loss": 0.6202, "rewards/accuracies": 1.0, "rewards/chosen": 0.7205405831336975, "rewards/margins": 0.15164291858673096, "rewards/rejected": 0.5688976645469666, "step": 363 }, { "epoch": 0.2, "learning_rate": 9.990521914943967e-08, "logits/chosen": -2.1203062534332275, "logits/rejected": -2.1206164360046387, "logps/chosen": -6.289493083953857, "logps/rejected": -4.65313720703125, "loss": 0.6963, "rewards/accuracies": 0.0, "rewards/chosen": 0.6229529976844788, "rewards/margins": -0.0062416791915893555, "rewards/rejected": 0.6291946768760681, "step": 364 }, { "epoch": 0.2, "learning_rate": 9.990387040528777e-08, "logits/chosen": -2.185560941696167, "logits/rejected": -2.213343858718872, "logps/chosen": -13.452661514282227, "logps/rejected": -21.20269775390625, "loss": 0.4172, "rewards/accuracies": 1.0, "rewards/chosen": 0.7963825464248657, "rewards/margins": 0.6584701538085938, "rewards/rejected": 0.13791237771511078, "step": 365 }, { "epoch": 0.2, "learning_rate": 9.990251214164222e-08, "logits/chosen": -2.0443027019500732, "logits/rejected": -2.2361934185028076, "logps/chosen": -8.718101501464844, "logps/rejected": -8.788177490234375, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.6121488809585571, "rewards/margins": 0.0035297274589538574, "rewards/rejected": 0.6086191534996033, "step": 366 }, { "epoch": 0.2, "learning_rate": 9.990114435876216e-08, "logits/chosen": -2.015159845352173, "logits/rejected": -2.0131235122680664, "logps/chosen": -2.9911608695983887, "logps/rejected": -4.517581462860107, "loss": 0.5574, "rewards/accuracies": 1.0, "rewards/chosen": 0.7810828685760498, "rewards/margins": 0.29281529784202576, "rewards/rejected": 0.48826757073402405, "step": 367 }, { "epoch": 0.2, "learning_rate": 9.989976705690848e-08, "logits/chosen": -2.0527632236480713, "logits/rejected": -2.2902865409851074, "logps/chosen": -1.9139323234558105, "logps/rejected": -1.9061768054962158, "loss": 0.6714, "rewards/accuracies": 1.0, "rewards/chosen": 0.8586794137954712, "rewards/margins": 0.04390996694564819, "rewards/rejected": 0.814769446849823, "step": 368 }, { "epoch": 0.2, "learning_rate": 9.98983802363439e-08, "logits/chosen": -2.127373218536377, "logits/rejected": -2.12583065032959, "logps/chosen": -4.7501912117004395, "logps/rejected": -6.238818168640137, "loss": 0.5066, "rewards/accuracies": 1.0, "rewards/chosen": 0.7931590676307678, "rewards/margins": 0.41596150398254395, "rewards/rejected": 0.3771975636482239, "step": 369 }, { "epoch": 0.2, "learning_rate": 9.989698389733298e-08, "logits/chosen": -1.9831501245498657, "logits/rejected": -2.2803943157196045, "logps/chosen": -3.7669568061828613, "logps/rejected": -3.9492549896240234, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.5367978811264038, "rewards/margins": 0.010602355003356934, "rewards/rejected": 0.5261955261230469, "step": 370 }, { "epoch": 0.2, "learning_rate": 9.98955780401421e-08, "logits/chosen": -2.1345057487487793, "logits/rejected": -2.1145339012145996, "logps/chosen": -28.37004852294922, "logps/rejected": -5.434370994567871, "loss": 0.5061, "rewards/accuracies": 1.0, "rewards/chosen": 0.9727516174316406, "rewards/margins": 0.41722631454467773, "rewards/rejected": 0.5555253028869629, "step": 371 }, { "epoch": 0.2, "learning_rate": 9.989416266503941e-08, "logits/chosen": -2.0883095264434814, "logits/rejected": -2.0881261825561523, "logps/chosen": -4.215726375579834, "logps/rejected": -3.6028623580932617, "loss": 0.5703, "rewards/accuracies": 1.0, "rewards/chosen": 0.8170284628868103, "rewards/margins": 0.26283353567123413, "rewards/rejected": 0.5541949272155762, "step": 372 }, { "epoch": 0.2, "learning_rate": 9.98927377722949e-08, "logits/chosen": -2.084728479385376, "logits/rejected": -2.2645223140716553, "logps/chosen": -4.772953987121582, "logps/rejected": -4.69513463973999, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.7561529278755188, "rewards/margins": 0.008706629276275635, "rewards/rejected": 0.7474462985992432, "step": 373 }, { "epoch": 0.2, "learning_rate": 9.989130336218037e-08, "logits/chosen": -2.091249465942383, "logits/rejected": -2.0959632396698, "logps/chosen": -5.283572196960449, "logps/rejected": -4.72831916809082, "loss": 0.7079, "rewards/accuracies": 0.0, "rewards/chosen": 0.6452534794807434, "rewards/margins": -0.02938520908355713, "rewards/rejected": 0.6746386885643005, "step": 374 }, { "epoch": 0.2, "learning_rate": 9.988985943496948e-08, "logits/chosen": -2.109050750732422, "logits/rejected": -2.108605146408081, "logps/chosen": -4.323441028594971, "logps/rejected": -11.726859092712402, "loss": 0.6028, "rewards/accuracies": 1.0, "rewards/chosen": 0.7343523502349854, "rewards/margins": 0.18975406885147095, "rewards/rejected": 0.5445982813835144, "step": 375 }, { "epoch": 0.2, "learning_rate": 9.988840599093764e-08, "logits/chosen": -1.9370861053466797, "logits/rejected": -2.2854204177856445, "logps/chosen": -7.704556465148926, "logps/rejected": -9.456378936767578, "loss": 0.5332, "rewards/accuracies": 1.0, "rewards/chosen": 0.638825535774231, "rewards/margins": 0.3504655957221985, "rewards/rejected": 0.28835994005203247, "step": 376 }, { "epoch": 0.2, "learning_rate": 9.988694303036211e-08, "logits/chosen": -2.125800132751465, "logits/rejected": -2.3091442584991455, "logps/chosen": -14.219507217407227, "logps/rejected": -13.416189193725586, "loss": 0.7092, "rewards/accuracies": 0.0, "rewards/chosen": 0.6245366930961609, "rewards/margins": -0.03194773197174072, "rewards/rejected": 0.6564844250679016, "step": 377 }, { "epoch": 0.2, "learning_rate": 9.988547055352197e-08, "logits/chosen": -2.0479655265808105, "logits/rejected": -2.0565569400787354, "logps/chosen": -4.55950927734375, "logps/rejected": -3.634368419647217, "loss": 0.57, "rewards/accuracies": 1.0, "rewards/chosen": 0.7729964256286621, "rewards/margins": 0.26369887590408325, "rewards/rejected": 0.5092975497245789, "step": 378 }, { "epoch": 0.2, "learning_rate": 9.988398856069808e-08, "logits/chosen": -2.0984396934509277, "logits/rejected": -2.073183536529541, "logps/chosen": -13.147397994995117, "logps/rejected": -6.064966201782227, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.48369351029396057, "rewards/margins": 0.030614852905273438, "rewards/rejected": 0.45307865738868713, "step": 379 }, { "epoch": 0.2, "learning_rate": 9.988249705217318e-08, "logits/chosen": -2.171438455581665, "logits/rejected": -2.154116630554199, "logps/chosen": -33.88790512084961, "logps/rejected": -17.606233596801758, "loss": 0.6513, "rewards/accuracies": 1.0, "rewards/chosen": 0.7103984951972961, "rewards/margins": 0.08544862270355225, "rewards/rejected": 0.6249498724937439, "step": 380 }, { "epoch": 0.21, "learning_rate": 9.988099602823175e-08, "logits/chosen": -2.1218957901000977, "logits/rejected": -2.34297513961792, "logps/chosen": -3.6531190872192383, "logps/rejected": -3.6116435527801514, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.9260827302932739, "rewards/margins": 0.024685025215148926, "rewards/rejected": 0.901397705078125, "step": 381 }, { "epoch": 0.21, "learning_rate": 9.987948548916011e-08, "logits/chosen": -2.1290090084075928, "logits/rejected": -2.230570077896118, "logps/chosen": -17.82169532775879, "logps/rejected": -23.67096710205078, "loss": 0.4429, "rewards/accuracies": 1.0, "rewards/chosen": 1.0067640542984009, "rewards/margins": 0.5848158001899719, "rewards/rejected": 0.42194825410842896, "step": 382 }, { "epoch": 0.21, "learning_rate": 9.987796543524645e-08, "logits/chosen": -2.1018733978271484, "logits/rejected": -2.4081027507781982, "logps/chosen": -24.048402786254883, "logps/rejected": -23.187660217285156, "loss": 0.7066, "rewards/accuracies": 0.0, "rewards/chosen": 0.3534338176250458, "rewards/margins": -0.026749223470687866, "rewards/rejected": 0.38018304109573364, "step": 383 }, { "epoch": 0.21, "learning_rate": 9.98764358667807e-08, "logits/chosen": -2.1136856079101562, "logits/rejected": -2.3064022064208984, "logps/chosen": -2.6196484565734863, "logps/rejected": -2.509519577026367, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.7212204337120056, "rewards/margins": 0.021908223628997803, "rewards/rejected": 0.6993122100830078, "step": 384 }, { "epoch": 0.21, "learning_rate": 9.987489678405465e-08, "logits/chosen": -2.0386037826538086, "logits/rejected": -2.023683547973633, "logps/chosen": -13.003220558166504, "logps/rejected": -12.831991195678711, "loss": 0.5446, "rewards/accuracies": 1.0, "rewards/chosen": 0.8949223756790161, "rewards/margins": 0.3231698274612427, "rewards/rejected": 0.5717525482177734, "step": 385 }, { "epoch": 0.21, "learning_rate": 9.987334818736188e-08, "logits/chosen": -2.1398894786834717, "logits/rejected": -2.273871898651123, "logps/chosen": -2.4332051277160645, "logps/rejected": -2.374919891357422, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.7267864942550659, "rewards/margins": 0.0072983503341674805, "rewards/rejected": 0.7194881439208984, "step": 386 }, { "epoch": 0.21, "learning_rate": 9.98717900769978e-08, "logits/chosen": -2.0638842582702637, "logits/rejected": -2.252941846847534, "logps/chosen": -3.764336109161377, "logps/rejected": -3.0789239406585693, "loss": 0.7373, "rewards/accuracies": 0.0, "rewards/chosen": 0.7341739535331726, "rewards/margins": -0.08645117282867432, "rewards/rejected": 0.8206251263618469, "step": 387 }, { "epoch": 0.21, "learning_rate": 9.987022245325962e-08, "logits/chosen": -2.151779890060425, "logits/rejected": -2.15834641456604, "logps/chosen": -4.649335861206055, "logps/rejected": -4.846306800842285, "loss": 0.4903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9071613550186157, "rewards/margins": 0.4575815498828888, "rewards/rejected": 0.44957980513572693, "step": 388 }, { "epoch": 0.21, "learning_rate": 9.986864531644638e-08, "logits/chosen": -2.0629241466522217, "logits/rejected": -2.0613081455230713, "logps/chosen": -3.1107237339019775, "logps/rejected": -4.946080207824707, "loss": 0.5487, "rewards/accuracies": 1.0, "rewards/chosen": 0.8734784126281738, "rewards/margins": 0.3133300542831421, "rewards/rejected": 0.5601483583450317, "step": 389 }, { "epoch": 0.21, "learning_rate": 9.986705866685893e-08, "logits/chosen": -2.121931314468384, "logits/rejected": -2.132948160171509, "logps/chosen": -2.7256247997283936, "logps/rejected": -9.091487884521484, "loss": 0.5493, "rewards/accuracies": 1.0, "rewards/chosen": 0.881767213344574, "rewards/margins": 0.3119705319404602, "rewards/rejected": 0.5697966814041138, "step": 390 }, { "epoch": 0.21, "learning_rate": 9.986546250479995e-08, "logits/chosen": -2.039560079574585, "logits/rejected": -2.268871307373047, "logps/chosen": -2.1882619857788086, "logps/rejected": -2.251537561416626, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.7991825342178345, "rewards/margins": 0.0026532411575317383, "rewards/rejected": 0.7965292930603027, "step": 391 }, { "epoch": 0.21, "learning_rate": 9.986385683057387e-08, "logits/chosen": -2.135399580001831, "logits/rejected": -2.3281362056732178, "logps/chosen": -19.280847549438477, "logps/rejected": -14.564892768859863, "loss": 0.8284, "rewards/accuracies": 0.0, "rewards/chosen": 0.2556608319282532, "rewards/margins": -0.254430890083313, "rewards/rejected": 0.5100917220115662, "step": 392 }, { "epoch": 0.21, "learning_rate": 9.986224164448704e-08, "logits/chosen": -2.0478615760803223, "logits/rejected": -2.235804796218872, "logps/chosen": -8.376404762268066, "logps/rejected": -8.522847175598145, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.5068348050117493, "rewards/margins": 0.021777242422103882, "rewards/rejected": 0.4850575625896454, "step": 393 }, { "epoch": 0.21, "learning_rate": 9.986061694684755e-08, "logits/chosen": -2.043186664581299, "logits/rejected": -2.0427052974700928, "logps/chosen": -3.3381505012512207, "logps/rejected": -6.271311283111572, "loss": 0.5168, "rewards/accuracies": 1.0, "rewards/chosen": 0.8897474408149719, "rewards/margins": 0.3905751705169678, "rewards/rejected": 0.49917227029800415, "step": 394 }, { "epoch": 0.21, "learning_rate": 9.985898273796531e-08, "logits/chosen": -2.162240982055664, "logits/rejected": -2.0854804515838623, "logps/chosen": -29.67475700378418, "logps/rejected": -4.102614879608154, "loss": 0.6508, "rewards/accuracies": 1.0, "rewards/chosen": 0.6754167675971985, "rewards/margins": 0.08651554584503174, "rewards/rejected": 0.5889012217521667, "step": 395 }, { "epoch": 0.21, "learning_rate": 9.985733901815208e-08, "logits/chosen": -2.070404529571533, "logits/rejected": -2.0703606605529785, "logps/chosen": -9.868814468383789, "logps/rejected": -3.8558061122894287, "loss": 0.4551, "rewards/accuracies": 1.0, "rewards/chosen": 1.0371812582015991, "rewards/margins": 0.5510365962982178, "rewards/rejected": 0.48614463210105896, "step": 396 }, { "epoch": 0.21, "learning_rate": 9.985568578772137e-08, "logits/chosen": -2.1408350467681885, "logits/rejected": -2.3338735103607178, "logps/chosen": -3.7957277297973633, "logps/rejected": -3.758561372756958, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.688360333442688, "rewards/margins": -0.013765692710876465, "rewards/rejected": 0.7021260261535645, "step": 397 }, { "epoch": 0.21, "learning_rate": 9.985402304698855e-08, "logits/chosen": -2.1219592094421387, "logits/rejected": -2.129807472229004, "logps/chosen": -9.452762603759766, "logps/rejected": -9.447620391845703, "loss": 0.4031, "rewards/accuracies": 1.0, "rewards/chosen": 0.984623372554779, "rewards/margins": 0.7002848386764526, "rewards/rejected": 0.2843385636806488, "step": 398 }, { "epoch": 0.22, "learning_rate": 9.985235079627086e-08, "logits/chosen": -2.0363430976867676, "logits/rejected": -2.0318410396575928, "logps/chosen": -12.447914123535156, "logps/rejected": -5.963410377502441, "loss": 0.4768, "rewards/accuracies": 1.0, "rewards/chosen": 0.8475731015205383, "rewards/margins": 0.49290332198143005, "rewards/rejected": 0.3546697795391083, "step": 399 }, { "epoch": 0.22, "learning_rate": 9.985066903588721e-08, "logits/chosen": -2.1143746376037598, "logits/rejected": -2.268526315689087, "logps/chosen": -4.4837775230407715, "logps/rejected": -4.399622440338135, "loss": 0.6997, "rewards/accuracies": 0.0, "rewards/chosen": 0.721061646938324, "rewards/margins": -0.013126850128173828, "rewards/rejected": 0.7341884970664978, "step": 400 }, { "epoch": 0.22, "learning_rate": 9.984897776615847e-08, "logits/chosen": -2.0967419147491455, "logits/rejected": -2.0965352058410645, "logps/chosen": -2.3922135829925537, "logps/rejected": -5.061074256896973, "loss": 0.6038, "rewards/accuracies": 1.0, "rewards/chosen": 0.7136354446411133, "rewards/margins": 0.18743562698364258, "rewards/rejected": 0.5261998176574707, "step": 401 }, { "epoch": 0.22, "learning_rate": 9.984727698740722e-08, "logits/chosen": -1.9866466522216797, "logits/rejected": -2.2715635299682617, "logps/chosen": -3.0716967582702637, "logps/rejected": -2.994816780090332, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.67221599817276, "rewards/margins": 0.015277981758117676, "rewards/rejected": 0.6569380164146423, "step": 402 }, { "epoch": 0.22, "learning_rate": 9.984556669995792e-08, "logits/chosen": -2.0602073669433594, "logits/rejected": -2.317941188812256, "logps/chosen": -1.6280282735824585, "logps/rejected": -4.0517354011535645, "loss": 0.6556, "rewards/accuracies": 1.0, "rewards/chosen": 0.8153498768806458, "rewards/margins": 0.07656657695770264, "rewards/rejected": 0.7387832999229431, "step": 403 }, { "epoch": 0.22, "learning_rate": 9.984384690413681e-08, "logits/chosen": -2.0079002380371094, "logits/rejected": -2.2277321815490723, "logps/chosen": -4.036055564880371, "logps/rejected": -3.6860737800598145, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.6937058568000793, "rewards/margins": 0.006637513637542725, "rewards/rejected": 0.6870683431625366, "step": 404 }, { "epoch": 0.22, "learning_rate": 9.984211760027196e-08, "logits/chosen": -2.1698553562164307, "logits/rejected": -2.3220927715301514, "logps/chosen": -3.469489812850952, "logps/rejected": -3.3792333602905273, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.791477382183075, "rewards/margins": -0.0056226253509521484, "rewards/rejected": 0.7971000075340271, "step": 405 }, { "epoch": 0.22, "learning_rate": 9.984037878869323e-08, "logits/chosen": -2.151461362838745, "logits/rejected": -2.1434805393218994, "logps/chosen": -3.5627551078796387, "logps/rejected": -7.140450477600098, "loss": 0.5583, "rewards/accuracies": 1.0, "rewards/chosen": 0.8547453284263611, "rewards/margins": 0.29064351320266724, "rewards/rejected": 0.5641018152236938, "step": 406 }, { "epoch": 0.22, "learning_rate": 9.983863046973233e-08, "logits/chosen": -2.123530149459839, "logits/rejected": -2.3047220706939697, "logps/chosen": -2.223252534866333, "logps/rejected": -2.3269686698913574, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.7447461485862732, "rewards/margins": 0.003630697727203369, "rewards/rejected": 0.7411154508590698, "step": 407 }, { "epoch": 0.22, "learning_rate": 9.983687264372273e-08, "logits/chosen": -2.1442630290985107, "logits/rejected": -2.145185947418213, "logps/chosen": -4.264191627502441, "logps/rejected": -6.1248016357421875, "loss": 0.5952, "rewards/accuracies": 1.0, "rewards/chosen": 0.5912689566612244, "rewards/margins": 0.20650628209114075, "rewards/rejected": 0.3847626745700836, "step": 408 }, { "epoch": 0.22, "learning_rate": 9.983510531099978e-08, "logits/chosen": -2.1856510639190674, "logits/rejected": -2.1807541847229004, "logps/chosen": -7.227438449859619, "logps/rejected": -11.186532974243164, "loss": 0.4947, "rewards/accuracies": 1.0, "rewards/chosen": 0.8752954602241516, "rewards/margins": 0.44625741243362427, "rewards/rejected": 0.42903804779052734, "step": 409 }, { "epoch": 0.22, "learning_rate": 9.98333284719006e-08, "logits/chosen": -2.136911630630493, "logits/rejected": -2.1327569484710693, "logps/chosen": -7.967340469360352, "logps/rejected": -4.392407417297363, "loss": 0.5619, "rewards/accuracies": 1.0, "rewards/chosen": 0.7643909454345703, "rewards/margins": 0.28227442502975464, "rewards/rejected": 0.4821165204048157, "step": 410 }, { "epoch": 0.22, "learning_rate": 9.983154212676414e-08, "logits/chosen": -2.0812158584594727, "logits/rejected": -2.0876994132995605, "logps/chosen": -6.584956169128418, "logps/rejected": -5.030646324157715, "loss": 0.4728, "rewards/accuracies": 1.0, "rewards/chosen": 0.9953721165657043, "rewards/margins": 0.5034314393997192, "rewards/rejected": 0.4919407069683075, "step": 411 }, { "epoch": 0.22, "learning_rate": 9.982974627593115e-08, "logits/chosen": -1.9742555618286133, "logits/rejected": -2.2756268978118896, "logps/chosen": -3.8841135501861572, "logps/rejected": -4.002819061279297, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.6480936408042908, "rewards/margins": 0.023307442665100098, "rewards/rejected": 0.6247861981391907, "step": 412 }, { "epoch": 0.22, "learning_rate": 9.98279409197442e-08, "logits/chosen": -2.0447545051574707, "logits/rejected": -2.085857629776001, "logps/chosen": -9.440020561218262, "logps/rejected": -20.514556884765625, "loss": 0.555, "rewards/accuracies": 1.0, "rewards/chosen": 0.7527133226394653, "rewards/margins": 0.29850322008132935, "rewards/rejected": 0.454210102558136, "step": 413 }, { "epoch": 0.22, "learning_rate": 9.982612605854765e-08, "logits/chosen": -2.142031192779541, "logits/rejected": -2.3309104442596436, "logps/chosen": -2.728351593017578, "logps/rejected": -2.7435667514801025, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.8178665041923523, "rewards/margins": 0.013903379440307617, "rewards/rejected": 0.8039631247520447, "step": 414 }, { "epoch": 0.22, "learning_rate": 9.982430169268774e-08, "logits/chosen": -2.1986136436462402, "logits/rejected": -2.2997026443481445, "logps/chosen": -3.409374713897705, "logps/rejected": -3.343783378601074, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.7837187647819519, "rewards/margins": 0.010180354118347168, "rewards/rejected": 0.7735384106636047, "step": 415 }, { "epoch": 0.22, "learning_rate": 9.982246782251247e-08, "logits/chosen": -2.0496246814727783, "logits/rejected": -2.098879337310791, "logps/chosen": -6.493721008300781, "logps/rejected": -11.687919616699219, "loss": 0.4943, "rewards/accuracies": 1.0, "rewards/chosen": 1.0077924728393555, "rewards/margins": 0.4473024010658264, "rewards/rejected": 0.560490071773529, "step": 416 }, { "epoch": 0.22, "learning_rate": 9.982062444837165e-08, "logits/chosen": -2.0653069019317627, "logits/rejected": -2.064042329788208, "logps/chosen": -2.621102809906006, "logps/rejected": -5.503194808959961, "loss": 0.534, "rewards/accuracies": 1.0, "rewards/chosen": 0.9282947778701782, "rewards/margins": 0.3484662175178528, "rewards/rejected": 0.5798285603523254, "step": 417 }, { "epoch": 0.23, "learning_rate": 9.98187715706169e-08, "logits/chosen": -2.0645408630371094, "logits/rejected": -2.0413031578063965, "logps/chosen": -21.89725685119629, "logps/rejected": -2.810598611831665, "loss": 0.5365, "rewards/accuracies": 1.0, "rewards/chosen": 0.8308544158935547, "rewards/margins": 0.3425828814506531, "rewards/rejected": 0.4882715344429016, "step": 418 }, { "epoch": 0.23, "learning_rate": 9.981690918960171e-08, "logits/chosen": -2.0896048545837402, "logits/rejected": -2.3195528984069824, "logps/chosen": -2.932534694671631, "logps/rejected": -2.9320437908172607, "loss": 0.7094, "rewards/accuracies": 0.0, "rewards/chosen": 0.6450456380844116, "rewards/margins": -0.03216654062271118, "rewards/rejected": 0.6772121787071228, "step": 419 }, { "epoch": 0.23, "learning_rate": 9.981503730568131e-08, "logits/chosen": -2.094231128692627, "logits/rejected": -2.245375871658325, "logps/chosen": -1.6170310974121094, "logps/rejected": -2.6192009449005127, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.752236008644104, "rewards/margins": 0.01853811740875244, "rewards/rejected": 0.7336978912353516, "step": 420 }, { "epoch": 0.23, "learning_rate": 9.98131559192128e-08, "logits/chosen": -2.0840985774993896, "logits/rejected": -2.253838300704956, "logps/chosen": -2.0068681240081787, "logps/rejected": -1.9921334981918335, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.6646957397460938, "rewards/margins": 0.001001119613647461, "rewards/rejected": 0.6636946201324463, "step": 421 }, { "epoch": 0.23, "learning_rate": 9.981126503055503e-08, "logits/chosen": -2.064401865005493, "logits/rejected": -2.2643980979919434, "logps/chosen": -1.9727978706359863, "logps/rejected": -1.859372615814209, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.6942542791366577, "rewards/margins": 0.008623063564300537, "rewards/rejected": 0.6856312155723572, "step": 422 }, { "epoch": 0.23, "learning_rate": 9.980936464006873e-08, "logits/chosen": -2.0324935913085938, "logits/rejected": -2.276947498321533, "logps/chosen": -10.48859977722168, "logps/rejected": -6.760149955749512, "loss": 0.7862, "rewards/accuracies": 0.0, "rewards/chosen": 0.7213148474693298, "rewards/margins": -0.1781919002532959, "rewards/rejected": 0.8995067477226257, "step": 423 }, { "epoch": 0.23, "learning_rate": 9.980745474811641e-08, "logits/chosen": -1.9982436895370483, "logits/rejected": -2.000450372695923, "logps/chosen": -3.8198981285095215, "logps/rejected": -4.736242294311523, "loss": 0.5264, "rewards/accuracies": 1.0, "rewards/chosen": 0.7662455439567566, "rewards/margins": 0.3670257329940796, "rewards/rejected": 0.399219810962677, "step": 424 }, { "epoch": 0.23, "learning_rate": 9.980553535506238e-08, "logits/chosen": -2.0580742359161377, "logits/rejected": -2.0592219829559326, "logps/chosen": -11.22733211517334, "logps/rejected": -3.751230001449585, "loss": 0.5482, "rewards/accuracies": 1.0, "rewards/chosen": 0.9091874361038208, "rewards/margins": 0.31450939178466797, "rewards/rejected": 0.5946780443191528, "step": 425 }, { "epoch": 0.23, "learning_rate": 9.980360646127277e-08, "logits/chosen": -2.1664133071899414, "logits/rejected": -2.280364990234375, "logps/chosen": -8.069746017456055, "logps/rejected": -29.119625091552734, "loss": 0.4025, "rewards/accuracies": 1.0, "rewards/chosen": 0.9145078659057617, "rewards/margins": 0.7020219564437866, "rewards/rejected": 0.2124858945608139, "step": 426 }, { "epoch": 0.23, "learning_rate": 9.980166806711556e-08, "logits/chosen": -2.058370351791382, "logits/rejected": -2.311117649078369, "logps/chosen": -2.0570716857910156, "logps/rejected": -2.1176505088806152, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.6307997107505798, "rewards/margins": 0.02253866195678711, "rewards/rejected": 0.6082610487937927, "step": 427 }, { "epoch": 0.23, "learning_rate": 9.979972017296049e-08, "logits/chosen": -2.2197868824005127, "logits/rejected": -2.32906436920166, "logps/chosen": -1.9980528354644775, "logps/rejected": -1.9608644247055054, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.823483407497406, "rewards/margins": -0.0010837316513061523, "rewards/rejected": 0.8245671391487122, "step": 428 }, { "epoch": 0.23, "learning_rate": 9.979776277917914e-08, "logits/chosen": -2.052673578262329, "logits/rejected": -2.2673370838165283, "logps/chosen": -1.927323818206787, "logps/rejected": -1.8049702644348145, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.80377596616745, "rewards/margins": 0.024624347686767578, "rewards/rejected": 0.7791516184806824, "step": 429 }, { "epoch": 0.23, "learning_rate": 9.979579588614488e-08, "logits/chosen": -1.9916138648986816, "logits/rejected": -2.000251054763794, "logps/chosen": -7.234183311462402, "logps/rejected": -5.836813449859619, "loss": 0.4789, "rewards/accuracies": 1.0, "rewards/chosen": 0.9926482439041138, "rewards/margins": 0.4871888756752014, "rewards/rejected": 0.5054593682289124, "step": 430 }, { "epoch": 0.23, "learning_rate": 9.979381949423296e-08, "logits/chosen": -2.166220188140869, "logits/rejected": -2.1609365940093994, "logps/chosen": -8.895788192749023, "logps/rejected": -6.570865631103516, "loss": 0.4788, "rewards/accuracies": 1.0, "rewards/chosen": 0.9206754565238953, "rewards/margins": 0.4874317944049835, "rewards/rejected": 0.43324366211891174, "step": 431 }, { "epoch": 0.23, "learning_rate": 9.979183360382031e-08, "logits/chosen": -2.017259359359741, "logits/rejected": -2.0236544609069824, "logps/chosen": -3.032038688659668, "logps/rejected": -5.2716755867004395, "loss": 0.508, "rewards/accuracies": 1.0, "rewards/chosen": 0.8700335621833801, "rewards/margins": 0.4125102460384369, "rewards/rejected": 0.45752331614494324, "step": 432 }, { "epoch": 0.23, "learning_rate": 9.978983821528581e-08, "logits/chosen": -2.0137171745300293, "logits/rejected": -2.0198330879211426, "logps/chosen": -4.383842468261719, "logps/rejected": -4.927996635437012, "loss": 0.5222, "rewards/accuracies": 1.0, "rewards/chosen": 0.8487855792045593, "rewards/margins": 0.37738844752311707, "rewards/rejected": 0.47139713168144226, "step": 433 }, { "epoch": 0.23, "learning_rate": 9.978783332901008e-08, "logits/chosen": -2.0727992057800293, "logits/rejected": -2.0794692039489746, "logps/chosen": -10.900613784790039, "logps/rejected": -1.9045758247375488, "loss": 0.6349, "rewards/accuracies": 1.0, "rewards/chosen": 0.8293083310127258, "rewards/margins": 0.120114266872406, "rewards/rejected": 0.7091940641403198, "step": 434 }, { "epoch": 0.23, "learning_rate": 9.978581894537557e-08, "logits/chosen": -2.076225519180298, "logits/rejected": -2.072033405303955, "logps/chosen": -7.534587860107422, "logps/rejected": -10.825624465942383, "loss": 0.6547, "rewards/accuracies": 1.0, "rewards/chosen": 0.9244462847709656, "rewards/margins": 0.0784345269203186, "rewards/rejected": 0.846011757850647, "step": 435 }, { "epoch": 0.24, "learning_rate": 9.978379506476653e-08, "logits/chosen": -1.949532151222229, "logits/rejected": -1.9490301609039307, "logps/chosen": -2.1614651679992676, "logps/rejected": -2.0390076637268066, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.724165141582489, "rewards/margins": 0.03261244297027588, "rewards/rejected": 0.6915526986122131, "step": 436 }, { "epoch": 0.24, "learning_rate": 9.978176168756902e-08, "logits/chosen": -2.063060760498047, "logits/rejected": -2.3162879943847656, "logps/chosen": -4.1569743156433105, "logps/rejected": -1.9966744184494019, "loss": 0.72, "rewards/accuracies": 0.0, "rewards/chosen": 0.5181841254234314, "rewards/margins": -0.05300414562225342, "rewards/rejected": 0.5711882710456848, "step": 437 }, { "epoch": 0.24, "learning_rate": 9.977971881417093e-08, "logits/chosen": -2.135620594024658, "logits/rejected": -2.1285560131073, "logps/chosen": -7.194052696228027, "logps/rejected": -5.064713954925537, "loss": 0.6745, "rewards/accuracies": 1.0, "rewards/chosen": 0.982505738735199, "rewards/margins": 0.0376744270324707, "rewards/rejected": 0.9448313117027283, "step": 438 }, { "epoch": 0.24, "learning_rate": 9.977766644496195e-08, "logits/chosen": -2.167482852935791, "logits/rejected": -2.2763586044311523, "logps/chosen": -4.794898986816406, "logps/rejected": -4.712972164154053, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.7107790112495422, "rewards/margins": 0.0038718581199645996, "rewards/rejected": 0.7069071531295776, "step": 439 }, { "epoch": 0.24, "learning_rate": 9.977560458033358e-08, "logits/chosen": -2.0482590198516846, "logits/rejected": -2.0530831813812256, "logps/chosen": -3.3534696102142334, "logps/rejected": -2.5495259761810303, "loss": 0.5739, "rewards/accuracies": 1.0, "rewards/chosen": 0.8633958101272583, "rewards/margins": 0.2546854019165039, "rewards/rejected": 0.6087104082107544, "step": 440 }, { "epoch": 0.24, "learning_rate": 9.977353322067914e-08, "logits/chosen": -2.042353868484497, "logits/rejected": -2.2948074340820312, "logps/chosen": -15.432228088378906, "logps/rejected": -16.06153678894043, "loss": 0.6697, "rewards/accuracies": 1.0, "rewards/chosen": 0.5361666083335876, "rewards/margins": 0.047475665807724, "rewards/rejected": 0.48869094252586365, "step": 441 }, { "epoch": 0.24, "learning_rate": 9.977145236639375e-08, "logits/chosen": -2.011747360229492, "logits/rejected": -2.226856231689453, "logps/chosen": -1.7276816368103027, "logps/rejected": -1.6767313480377197, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.7714030146598816, "rewards/margins": -0.005166828632354736, "rewards/rejected": 0.7765698432922363, "step": 442 }, { "epoch": 0.24, "learning_rate": 9.976936201787436e-08, "logits/chosen": -2.0492377281188965, "logits/rejected": -2.056122303009033, "logps/chosen": -3.125521183013916, "logps/rejected": -3.7291533946990967, "loss": 0.4489, "rewards/accuracies": 1.0, "rewards/chosen": 1.063998818397522, "rewards/margins": 0.5679853558540344, "rewards/rejected": 0.49601346254348755, "step": 443 }, { "epoch": 0.24, "learning_rate": 9.97672621755197e-08, "logits/chosen": -2.202601909637451, "logits/rejected": -2.098297119140625, "logps/chosen": -31.97630500793457, "logps/rejected": -20.419300079345703, "loss": 0.8512, "rewards/accuracies": 0.0, "rewards/chosen": 0.09579449146986008, "rewards/margins": -0.2944595217704773, "rewards/rejected": 0.39025402069091797, "step": 444 }, { "epoch": 0.24, "learning_rate": 9.976515283973033e-08, "logits/chosen": -2.0879228115081787, "logits/rejected": -2.2620983123779297, "logps/chosen": -2.316990375518799, "logps/rejected": -2.2499165534973145, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.8769640326499939, "rewards/margins": 5.704164505004883e-05, "rewards/rejected": 0.8769069910049438, "step": 445 }, { "epoch": 0.24, "learning_rate": 9.976303401090864e-08, "logits/chosen": -2.304643154144287, "logits/rejected": -2.2543303966522217, "logps/chosen": -7.5876970291137695, "logps/rejected": -7.308916091918945, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.47146472334861755, "rewards/margins": -0.0018458366394042969, "rewards/rejected": 0.47331055998802185, "step": 446 }, { "epoch": 0.24, "learning_rate": 9.976090568945879e-08, "logits/chosen": -2.102522134780884, "logits/rejected": -2.2584259510040283, "logps/chosen": -1.5771563053131104, "logps/rejected": -1.5418577194213867, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9016767740249634, "rewards/margins": -0.00014388561248779297, "rewards/rejected": 0.9018206596374512, "step": 447 }, { "epoch": 0.24, "learning_rate": 9.975876787578679e-08, "logits/chosen": -2.1618244647979736, "logits/rejected": -2.2811129093170166, "logps/chosen": -3.090043544769287, "logps/rejected": -2.8566176891326904, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.48090457916259766, "rewards/margins": -0.0053123533725738525, "rewards/rejected": 0.4862169325351715, "step": 448 }, { "epoch": 0.24, "learning_rate": 9.975662057030041e-08, "logits/chosen": -2.1081159114837646, "logits/rejected": -2.138014554977417, "logps/chosen": -6.475625038146973, "logps/rejected": -16.698123931884766, "loss": 0.4371, "rewards/accuracies": 1.0, "rewards/chosen": 0.9980325102806091, "rewards/margins": 0.6011300086975098, "rewards/rejected": 0.396902471780777, "step": 449 }, { "epoch": 0.24, "learning_rate": 9.975446377340929e-08, "logits/chosen": -2.1308236122131348, "logits/rejected": -2.130605936050415, "logps/chosen": -2.569880962371826, "logps/rejected": -1.5589611530303955, "loss": 0.6223, "rewards/accuracies": 1.0, "rewards/chosen": 0.8941397070884705, "rewards/margins": 0.1470428705215454, "rewards/rejected": 0.747096836566925, "step": 450 }, { "epoch": 0.24, "learning_rate": 9.975229748552485e-08, "logits/chosen": -1.9891635179519653, "logits/rejected": -2.258558988571167, "logps/chosen": -1.6484545469284058, "logps/rejected": -1.6250563859939575, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.7930322885513306, "rewards/margins": 0.02594006061553955, "rewards/rejected": 0.767092227935791, "step": 451 }, { "epoch": 0.24, "learning_rate": 9.975012170706032e-08, "logits/chosen": -1.9706417322158813, "logits/rejected": -2.295290946960449, "logps/chosen": -1.2387933731079102, "logps/rejected": -1.3617042303085327, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7204845547676086, "rewards/margins": 0.01764547824859619, "rewards/rejected": 0.7028390765190125, "step": 452 }, { "epoch": 0.24, "learning_rate": 9.974793643843075e-08, "logits/chosen": -2.0543413162231445, "logits/rejected": -2.295105457305908, "logps/chosen": -3.9165992736816406, "logps/rejected": -4.962995529174805, "loss": 0.6496, "rewards/accuracies": 1.0, "rewards/chosen": 0.8469523787498474, "rewards/margins": 0.08914893865585327, "rewards/rejected": 0.7578034400939941, "step": 453 }, { "epoch": 0.24, "learning_rate": 9.974574168005299e-08, "logits/chosen": -2.0970191955566406, "logits/rejected": -2.228581190109253, "logps/chosen": -1.6876097917556763, "logps/rejected": -1.5945892333984375, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.8266090750694275, "rewards/margins": 0.0015779733657836914, "rewards/rejected": 0.8250311017036438, "step": 454 }, { "epoch": 0.25, "learning_rate": 9.974353743234569e-08, "logits/chosen": -2.010296106338501, "logits/rejected": -2.240121841430664, "logps/chosen": -2.424638509750366, "logps/rejected": -2.5155913829803467, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.8734163641929626, "rewards/margins": 0.03832411766052246, "rewards/rejected": 0.8350922465324402, "step": 455 }, { "epoch": 0.25, "learning_rate": 9.974132369572934e-08, "logits/chosen": -2.068171501159668, "logits/rejected": -2.289228677749634, "logps/chosen": -8.922321319580078, "logps/rejected": -8.746259689331055, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.5717716217041016, "rewards/margins": 0.014756739139556885, "rewards/rejected": 0.5570148825645447, "step": 456 }, { "epoch": 0.25, "learning_rate": 9.973910047062622e-08, "logits/chosen": -2.0385665893554688, "logits/rejected": -2.2735509872436523, "logps/chosen": -3.979814291000366, "logps/rejected": -3.7626774311065674, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.6351802945137024, "rewards/margins": 0.00019872188568115234, "rewards/rejected": 0.6349815726280212, "step": 457 }, { "epoch": 0.25, "learning_rate": 9.973686775746044e-08, "logits/chosen": -2.0795764923095703, "logits/rejected": -2.080852508544922, "logps/chosen": -4.79740571975708, "logps/rejected": -3.5860683917999268, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7868708968162537, "rewards/margins": 0.01751530170440674, "rewards/rejected": 0.7693555951118469, "step": 458 }, { "epoch": 0.25, "learning_rate": 9.973462555665789e-08, "logits/chosen": -1.9658246040344238, "logits/rejected": -1.9656798839569092, "logps/chosen": -3.8841304779052734, "logps/rejected": -1.4944393634796143, "loss": 0.6501, "rewards/accuracies": 1.0, "rewards/chosen": 0.9830417633056641, "rewards/margins": 0.087954580783844, "rewards/rejected": 0.8950871825218201, "step": 459 }, { "epoch": 0.25, "learning_rate": 9.973237386864628e-08, "logits/chosen": -2.1481924057006836, "logits/rejected": -2.3161449432373047, "logps/chosen": -10.641952514648438, "logps/rejected": -13.279048919677734, "loss": 0.7482, "rewards/accuracies": 0.0, "rewards/chosen": 0.604146420955658, "rewards/margins": -0.10715919733047485, "rewards/rejected": 0.7113056182861328, "step": 460 }, { "epoch": 0.25, "learning_rate": 9.973011269385517e-08, "logits/chosen": -2.1120524406433105, "logits/rejected": -2.287562131881714, "logps/chosen": -3.834028720855713, "logps/rejected": -3.4285459518432617, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.5545203685760498, "rewards/margins": 0.027272343635559082, "rewards/rejected": 0.5272480249404907, "step": 461 }, { "epoch": 0.25, "learning_rate": 9.972784203271583e-08, "logits/chosen": -2.0395145416259766, "logits/rejected": -2.224306583404541, "logps/chosen": -1.6216497421264648, "logps/rejected": -1.6675790548324585, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.8619868159294128, "rewards/margins": 0.02966153621673584, "rewards/rejected": 0.832325279712677, "step": 462 }, { "epoch": 0.25, "learning_rate": 9.972556188566146e-08, "logits/chosen": -2.0310628414154053, "logits/rejected": -2.040407657623291, "logps/chosen": -2.3851394653320312, "logps/rejected": -4.571592807769775, "loss": 0.4831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9396665692329407, "rewards/margins": 0.476164311170578, "rewards/rejected": 0.46350225806236267, "step": 463 }, { "epoch": 0.25, "learning_rate": 9.972327225312697e-08, "logits/chosen": -1.9968258142471313, "logits/rejected": -1.9946770668029785, "logps/chosen": -1.0802806615829468, "logps/rejected": -10.950346946716309, "loss": 0.4837, "rewards/accuracies": 1.0, "rewards/chosen": 0.8727330565452576, "rewards/margins": 0.47467634081840515, "rewards/rejected": 0.3980567157268524, "step": 464 }, { "epoch": 0.25, "learning_rate": 9.972097313554917e-08, "logits/chosen": -2.000396728515625, "logits/rejected": -2.251315116882324, "logps/chosen": -2.3843886852264404, "logps/rejected": -2.285069465637207, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.8560781478881836, "rewards/margins": 0.03603845834732056, "rewards/rejected": 0.820039689540863, "step": 465 }, { "epoch": 0.25, "learning_rate": 9.971866453336661e-08, "logits/chosen": -2.0532283782958984, "logits/rejected": -2.2709765434265137, "logps/chosen": -1.7854244709014893, "logps/rejected": -1.8914048671722412, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.9319106936454773, "rewards/margins": 0.034234583377838135, "rewards/rejected": 0.8976761102676392, "step": 466 }, { "epoch": 0.25, "learning_rate": 9.971634644701965e-08, "logits/chosen": -2.034447193145752, "logits/rejected": -2.0275461673736572, "logps/chosen": -10.046466827392578, "logps/rejected": -7.6183390617370605, "loss": 0.4567, "rewards/accuracies": 1.0, "rewards/chosen": 1.133826494216919, "rewards/margins": 0.5467008352279663, "rewards/rejected": 0.5871256589889526, "step": 467 }, { "epoch": 0.25, "learning_rate": 9.97140188769505e-08, "logits/chosen": -1.9790337085723877, "logits/rejected": -1.9793775081634521, "logps/chosen": -1.7754833698272705, "logps/rejected": -1.9986093044281006, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.8772280812263489, "rewards/margins": 0.017977118492126465, "rewards/rejected": 0.8592509627342224, "step": 468 }, { "epoch": 0.25, "learning_rate": 9.971168182360316e-08, "logits/chosen": -1.9570367336273193, "logits/rejected": -2.2026352882385254, "logps/chosen": -2.3606739044189453, "logps/rejected": -2.2939491271972656, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.7021316885948181, "rewards/margins": 0.012644410133361816, "rewards/rejected": 0.6894872784614563, "step": 469 }, { "epoch": 0.25, "learning_rate": 9.970933528742345e-08, "logits/chosen": -2.0577328205108643, "logits/rejected": -2.0618417263031006, "logps/chosen": -2.2682137489318848, "logps/rejected": -4.005801677703857, "loss": 0.4764, "rewards/accuracies": 1.0, "rewards/chosen": 1.0189346075057983, "rewards/margins": 0.4937392473220825, "rewards/rejected": 0.5251953601837158, "step": 470 }, { "epoch": 0.25, "learning_rate": 9.970697926885898e-08, "logits/chosen": -2.0704143047332764, "logits/rejected": -2.1133060455322266, "logps/chosen": -7.374026298522949, "logps/rejected": -23.055770874023438, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 0.961675763130188, "rewards/margins": 1.024768352508545, "rewards/rejected": -0.06309261173009872, "step": 471 }, { "epoch": 0.25, "learning_rate": 9.970461376835914e-08, "logits/chosen": -2.079589605331421, "logits/rejected": -2.2643392086029053, "logps/chosen": -6.022359371185303, "logps/rejected": -1.0911918878555298, "loss": 0.8001, "rewards/accuracies": 0.0, "rewards/chosen": 0.5955221652984619, "rewards/margins": -0.2035212516784668, "rewards/rejected": 0.7990434169769287, "step": 472 }, { "epoch": 0.26, "learning_rate": 9.970223878637522e-08, "logits/chosen": -2.2108969688415527, "logits/rejected": -2.21069598197937, "logps/chosen": -1.4748897552490234, "logps/rejected": -2.6747238636016846, "loss": 0.536, "rewards/accuracies": 1.0, "rewards/chosen": 0.8906490206718445, "rewards/margins": 0.3437727689743042, "rewards/rejected": 0.5468762516975403, "step": 473 }, { "epoch": 0.26, "learning_rate": 9.969985432336023e-08, "logits/chosen": -2.121185779571533, "logits/rejected": -2.0720863342285156, "logps/chosen": -24.38098907470703, "logps/rejected": -3.854832172393799, "loss": 0.5083, "rewards/accuracies": 1.0, "rewards/chosen": 0.986484944820404, "rewards/margins": 0.41182297468185425, "rewards/rejected": 0.5746619701385498, "step": 474 }, { "epoch": 0.26, "learning_rate": 9.969746037976903e-08, "logits/chosen": -2.073930501937866, "logits/rejected": -2.2594852447509766, "logps/chosen": -2.5120925903320312, "logps/rejected": -2.5083487033843994, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.7136349081993103, "rewards/margins": 0.004547119140625, "rewards/rejected": 0.7090877890586853, "step": 475 }, { "epoch": 0.26, "learning_rate": 9.969505695605826e-08, "logits/chosen": -2.199580669403076, "logits/rejected": -2.243535280227661, "logps/chosen": -10.456693649291992, "logps/rejected": -5.949392318725586, "loss": 0.7759, "rewards/accuracies": 0.0, "rewards/chosen": 0.6272026300430298, "rewards/margins": -0.15908700227737427, "rewards/rejected": 0.786289632320404, "step": 476 }, { "epoch": 0.26, "learning_rate": 9.969264405268645e-08, "logits/chosen": -2.082736015319824, "logits/rejected": -2.011274814605713, "logps/chosen": -54.00884246826172, "logps/rejected": -3.381653308868408, "loss": 0.6701, "rewards/accuracies": 1.0, "rewards/chosen": 0.442962646484375, "rewards/margins": 0.046560078859329224, "rewards/rejected": 0.3964025676250458, "step": 477 }, { "epoch": 0.26, "learning_rate": 9.96902216701138e-08, "logits/chosen": -2.282580852508545, "logits/rejected": -2.291097640991211, "logps/chosen": -5.841450214385986, "logps/rejected": -3.9357972145080566, "loss": 0.4653, "rewards/accuracies": 1.0, "rewards/chosen": 1.2113593816757202, "rewards/margins": 0.5234247446060181, "rewards/rejected": 0.6879346370697021, "step": 478 }, { "epoch": 0.26, "learning_rate": 9.968778980880246e-08, "logits/chosen": -2.23486590385437, "logits/rejected": -2.204024314880371, "logps/chosen": -32.55116653442383, "logps/rejected": -33.240570068359375, "loss": 0.5732, "rewards/accuracies": 1.0, "rewards/chosen": 0.6599506735801697, "rewards/margins": 0.25619587302207947, "rewards/rejected": 0.4037548005580902, "step": 479 }, { "epoch": 0.26, "learning_rate": 9.968534846921628e-08, "logits/chosen": -2.145914077758789, "logits/rejected": -2.1343905925750732, "logps/chosen": -4.003479957580566, "logps/rejected": -6.598451614379883, "loss": 0.4935, "rewards/accuracies": 1.0, "rewards/chosen": 0.931292712688446, "rewards/margins": 0.44924572110176086, "rewards/rejected": 0.4820469915866852, "step": 480 }, { "epoch": 0.26, "learning_rate": 9.968289765182099e-08, "logits/chosen": -2.1195685863494873, "logits/rejected": -2.025080442428589, "logps/chosen": -25.549135208129883, "logps/rejected": -5.239275932312012, "loss": 0.5996, "rewards/accuracies": 1.0, "rewards/chosen": 0.8469865918159485, "rewards/margins": 0.1968051791191101, "rewards/rejected": 0.6501814126968384, "step": 481 }, { "epoch": 0.26, "learning_rate": 9.968043735708409e-08, "logits/chosen": -1.979612946510315, "logits/rejected": -2.2443888187408447, "logps/chosen": -1.9878685474395752, "logps/rejected": -7.402090072631836, "loss": 0.5822, "rewards/accuracies": 1.0, "rewards/chosen": 0.7412760853767395, "rewards/margins": 0.2357380986213684, "rewards/rejected": 0.5055379867553711, "step": 482 }, { "epoch": 0.26, "learning_rate": 9.967796758547489e-08, "logits/chosen": -1.9425742626190186, "logits/rejected": -2.2596421241760254, "logps/chosen": -1.2705352306365967, "logps/rejected": -1.2786238193511963, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.7665300369262695, "rewards/margins": 0.017350971698760986, "rewards/rejected": 0.7491790652275085, "step": 483 }, { "epoch": 0.26, "learning_rate": 9.967548833746451e-08, "logits/chosen": -2.22908091545105, "logits/rejected": -2.232147693634033, "logps/chosen": -1.9542385339736938, "logps/rejected": -3.2155532836914062, "loss": 0.5864, "rewards/accuracies": 1.0, "rewards/chosen": 0.5813643336296082, "rewards/margins": 0.22634479403495789, "rewards/rejected": 0.35501953959465027, "step": 484 }, { "epoch": 0.26, "learning_rate": 9.967299961352591e-08, "logits/chosen": -2.1612939834594727, "logits/rejected": -2.328944683074951, "logps/chosen": -7.94711971282959, "logps/rejected": -6.272340774536133, "loss": 0.755, "rewards/accuracies": 0.0, "rewards/chosen": 0.3532938063144684, "rewards/margins": -0.12003079056739807, "rewards/rejected": 0.47332459688186646, "step": 485 }, { "epoch": 0.26, "learning_rate": 9.96705014141338e-08, "logits/chosen": -2.0847625732421875, "logits/rejected": -2.30546236038208, "logps/chosen": -6.517168045043945, "logps/rejected": -2.2224113941192627, "loss": 0.7317, "rewards/accuracies": 0.0, "rewards/chosen": 0.7324533462524414, "rewards/margins": -0.07568579912185669, "rewards/rejected": 0.8081391453742981, "step": 486 }, { "epoch": 0.26, "learning_rate": 9.966799373976475e-08, "logits/chosen": -2.276792287826538, "logits/rejected": -2.4028401374816895, "logps/chosen": -15.3724365234375, "logps/rejected": -18.40525245666504, "loss": 0.7086, "rewards/accuracies": 0.0, "rewards/chosen": 0.44261056184768677, "rewards/margins": -0.030620187520980835, "rewards/rejected": 0.4732307493686676, "step": 487 }, { "epoch": 0.26, "learning_rate": 9.96654765908971e-08, "logits/chosen": -2.0566914081573486, "logits/rejected": -2.0524890422821045, "logps/chosen": -8.29028034210205, "logps/rejected": -3.8002898693084717, "loss": 0.5689, "rewards/accuracies": 1.0, "rewards/chosen": 0.9795041084289551, "rewards/margins": 0.26620346307754517, "rewards/rejected": 0.7133006453514099, "step": 488 }, { "epoch": 0.26, "learning_rate": 9.966294996801103e-08, "logits/chosen": -2.160601854324341, "logits/rejected": -2.2917442321777344, "logps/chosen": -1.4250173568725586, "logps/rejected": -1.4144635200500488, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8464785814285278, "rewards/margins": 0.0021033287048339844, "rewards/rejected": 0.8443752527236938, "step": 489 }, { "epoch": 0.26, "learning_rate": 9.96604138715885e-08, "logits/chosen": -2.0686655044555664, "logits/rejected": -2.0710642337799072, "logps/chosen": -1.6825578212738037, "logps/rejected": -11.184167861938477, "loss": 0.5997, "rewards/accuracies": 1.0, "rewards/chosen": 0.9228883981704712, "rewards/margins": 0.19657593965530396, "rewards/rejected": 0.7263124585151672, "step": 490 }, { "epoch": 0.26, "learning_rate": 9.96578683021133e-08, "logits/chosen": -1.9967176914215088, "logits/rejected": -2.192199230194092, "logps/chosen": -2.771764039993286, "logps/rejected": -3.0130319595336914, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8639017939567566, "rewards/margins": 0.008771300315856934, "rewards/rejected": 0.8551304936408997, "step": 491 }, { "epoch": 0.27, "learning_rate": 9.965531326007097e-08, "logits/chosen": -2.0738461017608643, "logits/rejected": -2.0704596042633057, "logps/chosen": -14.18697738647461, "logps/rejected": -13.187575340270996, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.7809244394302368, "rewards/margins": 0.007597446441650391, "rewards/rejected": 0.7733269929885864, "step": 492 }, { "epoch": 0.27, "learning_rate": 9.965274874594896e-08, "logits/chosen": -2.0330374240875244, "logits/rejected": -2.306570053100586, "logps/chosen": -2.012200355529785, "logps/rejected": -1.911266565322876, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.8603904843330383, "rewards/margins": 0.004322230815887451, "rewards/rejected": 0.8560682535171509, "step": 493 }, { "epoch": 0.27, "learning_rate": 9.965017476023643e-08, "logits/chosen": -2.0713040828704834, "logits/rejected": -2.075913429260254, "logps/chosen": -2.180994987487793, "logps/rejected": -2.344054937362671, "loss": 0.511, "rewards/accuracies": 1.0, "rewards/chosen": 1.0768215656280518, "rewards/margins": 0.4049106240272522, "rewards/rejected": 0.6719109416007996, "step": 494 }, { "epoch": 0.27, "learning_rate": 9.96475913034244e-08, "logits/chosen": -2.2045750617980957, "logits/rejected": -2.206345796585083, "logps/chosen": -1.1548700332641602, "logps/rejected": -3.210948944091797, "loss": 0.5193, "rewards/accuracies": 1.0, "rewards/chosen": 0.9568726420402527, "rewards/margins": 0.384474515914917, "rewards/rejected": 0.5723981261253357, "step": 495 }, { "epoch": 0.27, "learning_rate": 9.964499837600568e-08, "logits/chosen": -2.0023765563964844, "logits/rejected": -2.256598472595215, "logps/chosen": -3.7410454750061035, "logps/rejected": -3.410052537918091, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.4081997573375702, "rewards/margins": 0.007293671369552612, "rewards/rejected": 0.4009060859680176, "step": 496 }, { "epoch": 0.27, "learning_rate": 9.964239597847487e-08, "logits/chosen": -2.0372793674468994, "logits/rejected": -2.266225814819336, "logps/chosen": -5.095218658447266, "logps/rejected": -3.818582534790039, "loss": 0.7368, "rewards/accuracies": 0.0, "rewards/chosen": 0.6420837640762329, "rewards/margins": -0.0855112075805664, "rewards/rejected": 0.7275949716567993, "step": 497 }, { "epoch": 0.27, "learning_rate": 9.963978411132843e-08, "logits/chosen": -1.9594179391860962, "logits/rejected": -1.9653570652008057, "logps/chosen": -1.0923559665679932, "logps/rejected": -6.9750075340271, "loss": 0.5065, "rewards/accuracies": 1.0, "rewards/chosen": 0.9775258898735046, "rewards/margins": 0.4161931276321411, "rewards/rejected": 0.5613327622413635, "step": 498 }, { "epoch": 0.27, "learning_rate": 9.963716277506455e-08, "logits/chosen": -2.1491124629974365, "logits/rejected": -2.160266637802124, "logps/chosen": -4.941389083862305, "logps/rejected": -4.75853967666626, "loss": 0.5888, "rewards/accuracies": 1.0, "rewards/chosen": 1.1286089420318604, "rewards/margins": 0.22082388401031494, "rewards/rejected": 0.9077850580215454, "step": 499 }, { "epoch": 0.27, "learning_rate": 9.963453197018332e-08, "logits/chosen": -2.169297456741333, "logits/rejected": -2.258082866668701, "logps/chosen": -4.491331577301025, "logps/rejected": -8.777767181396484, "loss": 0.6254, "rewards/accuracies": 1.0, "rewards/chosen": 0.9313102960586548, "rewards/margins": 0.14051800966262817, "rewards/rejected": 0.7907922863960266, "step": 500 }, { "epoch": 0.27, "learning_rate": 9.963189169718654e-08, "logits/chosen": -2.187497615814209, "logits/rejected": -2.1944990158081055, "logps/chosen": -2.96187686920166, "logps/rejected": -3.7968554496765137, "loss": 0.5223, "rewards/accuracies": 1.0, "rewards/chosen": 0.8258067965507507, "rewards/margins": 0.37690746784210205, "rewards/rejected": 0.4488993287086487, "step": 501 }, { "epoch": 0.27, "learning_rate": 9.962924195657785e-08, "logits/chosen": -2.0535507202148438, "logits/rejected": -2.2587194442749023, "logps/chosen": -2.157545804977417, "logps/rejected": -2.0771780014038086, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.6812862753868103, "rewards/margins": 0.02838873863220215, "rewards/rejected": 0.6528975367546082, "step": 502 }, { "epoch": 0.27, "learning_rate": 9.962658274886274e-08, "logits/chosen": -2.1350769996643066, "logits/rejected": -2.1462090015411377, "logps/chosen": -6.326963424682617, "logps/rejected": -4.298957347869873, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": 1.0317491292953491, "rewards/margins": 0.19897037744522095, "rewards/rejected": 0.8327787518501282, "step": 503 }, { "epoch": 0.27, "learning_rate": 9.962391407454847e-08, "logits/chosen": -2.0537848472595215, "logits/rejected": -2.3065242767333984, "logps/chosen": -2.2690136432647705, "logps/rejected": -2.4289145469665527, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.5642725229263306, "rewards/margins": 0.03341704607009888, "rewards/rejected": 0.5308554768562317, "step": 504 }, { "epoch": 0.27, "learning_rate": 9.962123593414407e-08, "logits/chosen": -2.0743188858032227, "logits/rejected": -2.074130058288574, "logps/chosen": -1.85128653049469, "logps/rejected": -1.7614213228225708, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.8574023246765137, "rewards/margins": 0.03207653760910034, "rewards/rejected": 0.8253257870674133, "step": 505 }, { "epoch": 0.27, "learning_rate": 9.961854832816046e-08, "logits/chosen": -2.0186028480529785, "logits/rejected": -2.298480987548828, "logps/chosen": -2.672348737716675, "logps/rejected": -2.5457611083984375, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.8708664774894714, "rewards/margins": -0.0012570619583129883, "rewards/rejected": 0.8721235394477844, "step": 506 }, { "epoch": 0.27, "learning_rate": 9.96158512571103e-08, "logits/chosen": -2.1020781993865967, "logits/rejected": -2.098522186279297, "logps/chosen": -6.616942405700684, "logps/rejected": -4.738771438598633, "loss": 0.4954, "rewards/accuracies": 1.0, "rewards/chosen": 0.9514569640159607, "rewards/margins": 0.44456398487091064, "rewards/rejected": 0.50689297914505, "step": 507 }, { "epoch": 0.27, "learning_rate": 9.961314472150806e-08, "logits/chosen": -2.0989997386932373, "logits/rejected": -2.3081068992614746, "logps/chosen": -1.9802138805389404, "logps/rejected": -2.047621488571167, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 1.0133315324783325, "rewards/margins": 0.014625728130340576, "rewards/rejected": 0.9987058043479919, "step": 508 }, { "epoch": 0.27, "learning_rate": 9.961042872187005e-08, "logits/chosen": -2.0513763427734375, "logits/rejected": -2.056269645690918, "logps/chosen": -5.880397319793701, "logps/rejected": -3.251152515411377, "loss": 0.4902, "rewards/accuracies": 1.0, "rewards/chosen": 1.0623737573623657, "rewards/margins": 0.45788657665252686, "rewards/rejected": 0.6044871807098389, "step": 509 }, { "epoch": 0.28, "learning_rate": 9.960770325871434e-08, "logits/chosen": -2.1230385303497314, "logits/rejected": -2.21934175491333, "logps/chosen": -4.473421096801758, "logps/rejected": -4.641340732574463, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.8643550276756287, "rewards/margins": 0.02925771474838257, "rewards/rejected": 0.8350973129272461, "step": 510 }, { "epoch": 0.28, "learning_rate": 9.960496833256085e-08, "logits/chosen": -2.1803510189056396, "logits/rejected": -2.3684098720550537, "logps/chosen": -1.8583983182907104, "logps/rejected": -1.7923380136489868, "loss": 0.7034, "rewards/accuracies": 0.0, "rewards/chosen": 0.7443674206733704, "rewards/margins": -0.02034527063369751, "rewards/rejected": 0.7647126913070679, "step": 511 }, { "epoch": 0.28, "learning_rate": 9.96022239439313e-08, "logits/chosen": -2.074833869934082, "logits/rejected": -2.350281000137329, "logps/chosen": -1.0637282133102417, "logps/rejected": -1.0728968381881714, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.9459437727928162, "rewards/margins": 0.01996833086013794, "rewards/rejected": 0.9259754419326782, "step": 512 }, { "epoch": 0.28, "learning_rate": 9.959947009334916e-08, "logits/chosen": -2.0409772396087646, "logits/rejected": -2.024372100830078, "logps/chosen": -21.709728240966797, "logps/rejected": -6.154195785522461, "loss": 0.7085, "rewards/accuracies": 0.0, "rewards/chosen": 0.955997884273529, "rewards/margins": -0.030447185039520264, "rewards/rejected": 0.9864450693130493, "step": 513 }, { "epoch": 0.28, "learning_rate": 9.959670678133978e-08, "logits/chosen": -2.0203471183776855, "logits/rejected": -2.02658748626709, "logps/chosen": -5.798807144165039, "logps/rejected": -3.352917432785034, "loss": 0.5633, "rewards/accuracies": 1.0, "rewards/chosen": 0.9230179190635681, "rewards/margins": 0.2791013717651367, "rewards/rejected": 0.6439165472984314, "step": 514 }, { "epoch": 0.28, "learning_rate": 9.959393400843026e-08, "logits/chosen": -2.1376349925994873, "logits/rejected": -2.1333255767822266, "logps/chosen": -12.971020698547363, "logps/rejected": -3.8333168029785156, "loss": 0.5193, "rewards/accuracies": 1.0, "rewards/chosen": 0.9753348231315613, "rewards/margins": 0.38442057371139526, "rewards/rejected": 0.590914249420166, "step": 515 }, { "epoch": 0.28, "learning_rate": 9.959115177514952e-08, "logits/chosen": -2.104862928390503, "logits/rejected": -2.108091115951538, "logps/chosen": -1.6178388595581055, "logps/rejected": -1.8123159408569336, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 1.0086175203323364, "rewards/margins": 0.27595943212509155, "rewards/rejected": 0.7326580882072449, "step": 516 }, { "epoch": 0.28, "learning_rate": 9.958836008202834e-08, "logits/chosen": -2.0760996341705322, "logits/rejected": -2.0753114223480225, "logps/chosen": -4.531424522399902, "logps/rejected": -5.339522838592529, "loss": 0.4991, "rewards/accuracies": 1.0, "rewards/chosen": 1.0784106254577637, "rewards/margins": 0.43492060899734497, "rewards/rejected": 0.6434900164604187, "step": 517 }, { "epoch": 0.28, "learning_rate": 9.958555892959918e-08, "logits/chosen": -2.1560211181640625, "logits/rejected": -2.0330328941345215, "logps/chosen": -40.12018585205078, "logps/rejected": -20.840089797973633, "loss": 0.5093, "rewards/accuracies": 1.0, "rewards/chosen": 0.9121994376182556, "rewards/margins": 0.40918123722076416, "rewards/rejected": 0.5030182003974915, "step": 518 }, { "epoch": 0.28, "learning_rate": 9.958274831839641e-08, "logits/chosen": -2.001410484313965, "logits/rejected": -2.006889581680298, "logps/chosen": -2.3569116592407227, "logps/rejected": -3.7160534858703613, "loss": 0.5055, "rewards/accuracies": 1.0, "rewards/chosen": 0.9176625609397888, "rewards/margins": 0.41885536909103394, "rewards/rejected": 0.4988071918487549, "step": 519 }, { "epoch": 0.28, "learning_rate": 9.95799282489562e-08, "logits/chosen": -1.9987167119979858, "logits/rejected": -2.002875566482544, "logps/chosen": -3.182191848754883, "logps/rejected": -3.300286293029785, "loss": 0.4882, "rewards/accuracies": 1.0, "rewards/chosen": 1.0381897687911987, "rewards/margins": 0.4630453586578369, "rewards/rejected": 0.5751444101333618, "step": 520 }, { "epoch": 0.28, "learning_rate": 9.957709872181647e-08, "logits/chosen": -1.988217830657959, "logits/rejected": -1.9885761737823486, "logps/chosen": -2.704421281814575, "logps/rejected": -1.1022347211837769, "loss": 0.6699, "rewards/accuracies": 1.0, "rewards/chosen": 0.8056597113609314, "rewards/margins": 0.04711318016052246, "rewards/rejected": 0.7585465312004089, "step": 521 }, { "epoch": 0.28, "learning_rate": 9.957425973751696e-08, "logits/chosen": -1.9753080606460571, "logits/rejected": -2.2601144313812256, "logps/chosen": -0.7487569451332092, "logps/rejected": -0.7541095614433289, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.7716926336288452, "rewards/margins": 0.00567394495010376, "rewards/rejected": 0.7660186886787415, "step": 522 }, { "epoch": 0.28, "learning_rate": 9.957141129659925e-08, "logits/chosen": -2.1482439041137695, "logits/rejected": -2.149813175201416, "logps/chosen": -4.961142539978027, "logps/rejected": -3.4558098316192627, "loss": 0.5424, "rewards/accuracies": 1.0, "rewards/chosen": 0.9394392967224121, "rewards/margins": 0.3282162547111511, "rewards/rejected": 0.611223042011261, "step": 523 }, { "epoch": 0.28, "learning_rate": 9.956855339960668e-08, "logits/chosen": -2.130626916885376, "logits/rejected": -2.3149752616882324, "logps/chosen": -5.4895830154418945, "logps/rejected": -5.31840705871582, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.6574304699897766, "rewards/margins": 0.0136946439743042, "rewards/rejected": 0.6437358260154724, "step": 524 }, { "epoch": 0.28, "learning_rate": 9.956568604708442e-08, "logits/chosen": -2.116914749145508, "logits/rejected": -2.282477378845215, "logps/chosen": -0.7783966660499573, "logps/rejected": -0.8442702889442444, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.8857212066650391, "rewards/margins": 0.029864788055419922, "rewards/rejected": 0.8558564186096191, "step": 525 }, { "epoch": 0.28, "learning_rate": 9.956280923957944e-08, "logits/chosen": -2.073228359222412, "logits/rejected": -2.234081506729126, "logps/chosen": -3.099299192428589, "logps/rejected": -3.045893907546997, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.6683639883995056, "rewards/margins": 0.0003110170364379883, "rewards/rejected": 0.6680529713630676, "step": 526 }, { "epoch": 0.28, "learning_rate": 9.955992297764053e-08, "logits/chosen": -2.0614349842071533, "logits/rejected": -2.052762746810913, "logps/chosen": -12.943922996520996, "logps/rejected": -1.811079978942871, "loss": 0.6605, "rewards/accuracies": 1.0, "rewards/chosen": 0.9907973408699036, "rewards/margins": 0.06637847423553467, "rewards/rejected": 0.9244188666343689, "step": 527 }, { "epoch": 0.28, "learning_rate": 9.955702726181822e-08, "logits/chosen": -2.111879825592041, "logits/rejected": -2.0920655727386475, "logps/chosen": -16.647741317749023, "logps/rejected": -5.450404644012451, "loss": 0.7086, "rewards/accuracies": 0.0, "rewards/chosen": 0.6517854928970337, "rewards/margins": -0.030624985694885254, "rewards/rejected": 0.682410478591919, "step": 528 }, { "epoch": 0.29, "learning_rate": 9.95541220926649e-08, "logits/chosen": -2.1190998554229736, "logits/rejected": -2.2433435916900635, "logps/chosen": -3.897563934326172, "logps/rejected": -3.8979268074035645, "loss": 0.6999, "rewards/accuracies": 0.0, "rewards/chosen": 0.48621055483818054, "rewards/margins": -0.013362914323806763, "rewards/rejected": 0.4995734691619873, "step": 529 }, { "epoch": 0.29, "learning_rate": 9.955120747073477e-08, "logits/chosen": -2.07974910736084, "logits/rejected": -2.2774903774261475, "logps/chosen": -3.4250495433807373, "logps/rejected": -3.2804980278015137, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.9167184233665466, "rewards/margins": 0.001481771469116211, "rewards/rejected": 0.9152366518974304, "step": 530 }, { "epoch": 0.29, "learning_rate": 9.954828339658381e-08, "logits/chosen": -2.063483476638794, "logits/rejected": -1.9752367734909058, "logps/chosen": -38.083404541015625, "logps/rejected": -2.4185683727264404, "loss": 0.5804, "rewards/accuracies": 1.0, "rewards/chosen": 0.7525928616523743, "rewards/margins": 0.23980188369750977, "rewards/rejected": 0.5127909779548645, "step": 531 }, { "epoch": 0.29, "learning_rate": 9.954534987076978e-08, "logits/chosen": -2.080315351486206, "logits/rejected": -2.2327725887298584, "logps/chosen": -4.898715972900391, "logps/rejected": -4.885457992553711, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.7960914969444275, "rewards/margins": -0.004227876663208008, "rewards/rejected": 0.8003193736076355, "step": 532 }, { "epoch": 0.29, "learning_rate": 9.954240689385229e-08, "logits/chosen": -2.159364700317383, "logits/rejected": -2.1312355995178223, "logps/chosen": -12.27098274230957, "logps/rejected": -17.17816734313965, "loss": 0.5252, "rewards/accuracies": 1.0, "rewards/chosen": 0.6212732195854187, "rewards/margins": 0.3699744939804077, "rewards/rejected": 0.251298725605011, "step": 533 }, { "epoch": 0.29, "learning_rate": 9.953945446639273e-08, "logits/chosen": -2.2206907272338867, "logits/rejected": -2.2164294719696045, "logps/chosen": -5.718854904174805, "logps/rejected": -2.19513201713562, "loss": 0.5884, "rewards/accuracies": 1.0, "rewards/chosen": 0.916854202747345, "rewards/margins": 0.2217891812324524, "rewards/rejected": 0.6950650215148926, "step": 534 }, { "epoch": 0.29, "learning_rate": 9.953649258895431e-08, "logits/chosen": -2.067300319671631, "logits/rejected": -2.2579872608184814, "logps/chosen": -2.2217557430267334, "logps/rejected": -2.060276508331299, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.7126602530479431, "rewards/margins": 0.009224295616149902, "rewards/rejected": 0.7034359574317932, "step": 535 }, { "epoch": 0.29, "learning_rate": 9.9533521262102e-08, "logits/chosen": -2.0999763011932373, "logits/rejected": -2.0970146656036377, "logps/chosen": -10.78906536102295, "logps/rejected": -3.148242950439453, "loss": 0.4824, "rewards/accuracies": 1.0, "rewards/chosen": 1.0180503129959106, "rewards/margins": 0.47817671298980713, "rewards/rejected": 0.5398736000061035, "step": 536 }, { "epoch": 0.29, "learning_rate": 9.953054048640261e-08, "logits/chosen": -2.1246654987335205, "logits/rejected": -2.1252124309539795, "logps/chosen": -1.335614800453186, "logps/rejected": -6.435144901275635, "loss": 0.4942, "rewards/accuracies": 1.0, "rewards/chosen": 0.8774798512458801, "rewards/margins": 0.44755789637565613, "rewards/rejected": 0.429921954870224, "step": 537 }, { "epoch": 0.29, "learning_rate": 9.952755026242477e-08, "logits/chosen": -1.981451392173767, "logits/rejected": -1.9816112518310547, "logps/chosen": -1.187565565109253, "logps/rejected": -3.0560107231140137, "loss": 0.5893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9126855134963989, "rewards/margins": 0.21977561712265015, "rewards/rejected": 0.6929098963737488, "step": 538 }, { "epoch": 0.29, "learning_rate": 9.952455059073884e-08, "logits/chosen": -2.0168659687042236, "logits/rejected": -2.0149013996124268, "logps/chosen": -1.9611821174621582, "logps/rejected": -4.091604232788086, "loss": 0.5534, "rewards/accuracies": 1.0, "rewards/chosen": 0.8341051340103149, "rewards/margins": 0.30216294527053833, "rewards/rejected": 0.5319421887397766, "step": 539 }, { "epoch": 0.29, "learning_rate": 9.952154147191705e-08, "logits/chosen": -2.032888889312744, "logits/rejected": -2.0237221717834473, "logps/chosen": -5.324394226074219, "logps/rejected": -5.751420021057129, "loss": 0.5634, "rewards/accuracies": 1.0, "rewards/chosen": 1.0674771070480347, "rewards/margins": 0.27888238430023193, "rewards/rejected": 0.7885947227478027, "step": 540 }, { "epoch": 0.29, "learning_rate": 9.951852290653339e-08, "logits/chosen": -2.016594171524048, "logits/rejected": -2.3404245376586914, "logps/chosen": -1.657785415649414, "logps/rejected": -1.484261155128479, "loss": 0.7194, "rewards/accuracies": 0.0, "rewards/chosen": 0.919378399848938, "rewards/margins": -0.05177038908004761, "rewards/rejected": 0.9711487889289856, "step": 541 }, { "epoch": 0.29, "learning_rate": 9.951549489516372e-08, "logits/chosen": -2.032038450241089, "logits/rejected": -2.300426721572876, "logps/chosen": -1.3664846420288086, "logps/rejected": -1.2513116598129272, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.7873436212539673, "rewards/margins": 0.024180471897125244, "rewards/rejected": 0.763163149356842, "step": 542 }, { "epoch": 0.29, "learning_rate": 9.951245743838561e-08, "logits/chosen": -2.083066463470459, "logits/rejected": -2.0828423500061035, "logps/chosen": -1.9642205238342285, "logps/rejected": -1.8496730327606201, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.7736438512802124, "rewards/margins": 0.025759458541870117, "rewards/rejected": 0.7478843927383423, "step": 543 }, { "epoch": 0.29, "learning_rate": 9.950941053677848e-08, "logits/chosen": -2.1068623065948486, "logits/rejected": -2.2892370223999023, "logps/chosen": -8.81527328491211, "logps/rejected": -36.11506652832031, "loss": 0.3664, "rewards/accuracies": 1.0, "rewards/chosen": 1.0761383771896362, "rewards/margins": 0.8153119087219238, "rewards/rejected": 0.2608264982700348, "step": 544 }, { "epoch": 0.29, "learning_rate": 9.950635419092357e-08, "logits/chosen": -2.0213167667388916, "logits/rejected": -2.264803409576416, "logps/chosen": -1.2137837409973145, "logps/rejected": -1.122135043144226, "loss": 0.6697, "rewards/accuracies": 1.0, "rewards/chosen": 0.8820716738700867, "rewards/margins": 0.047525882720947266, "rewards/rejected": 0.8345457911491394, "step": 545 }, { "epoch": 0.29, "learning_rate": 9.950328840140386e-08, "logits/chosen": -2.013974666595459, "logits/rejected": -2.2482378482818604, "logps/chosen": -0.861533522605896, "logps/rejected": -0.9041919708251953, "loss": 0.7032, "rewards/accuracies": 0.0, "rewards/chosen": 0.8245663642883301, "rewards/margins": -0.02006429433822632, "rewards/rejected": 0.8446306586265564, "step": 546 }, { "epoch": 0.3, "learning_rate": 9.95002131688042e-08, "logits/chosen": -2.167156934738159, "logits/rejected": -2.3420157432556152, "logps/chosen": -1.8637596368789673, "logps/rejected": -2.043606996536255, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.9320859313011169, "rewards/margins": 0.027170002460479736, "rewards/rejected": 0.9049159288406372, "step": 547 }, { "epoch": 0.3, "learning_rate": 9.949712849371121e-08, "logits/chosen": -1.9638323783874512, "logits/rejected": -2.270054578781128, "logps/chosen": -0.7405050992965698, "logps/rejected": -0.7767194509506226, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.8571738600730896, "rewards/margins": 0.024383962154388428, "rewards/rejected": 0.8327898979187012, "step": 548 }, { "epoch": 0.3, "learning_rate": 9.949403437671329e-08, "logits/chosen": -2.0536694526672363, "logits/rejected": -2.05539870262146, "logps/chosen": -1.2133729457855225, "logps/rejected": -4.322612285614014, "loss": 0.5622, "rewards/accuracies": 1.0, "rewards/chosen": 0.7653955817222595, "rewards/margins": 0.2816055119037628, "rewards/rejected": 0.4837900698184967, "step": 549 }, { "epoch": 0.3, "learning_rate": 9.94909308184007e-08, "logits/chosen": -2.065415382385254, "logits/rejected": -2.0593948364257812, "logps/chosen": -4.213314533233643, "logps/rejected": -5.57557487487793, "loss": 0.5238, "rewards/accuracies": 1.0, "rewards/chosen": 1.125425934791565, "rewards/margins": 0.3733797073364258, "rewards/rejected": 0.7520462274551392, "step": 550 }, { "epoch": 0.3, "learning_rate": 9.948781781936542e-08, "logits/chosen": -2.0417873859405518, "logits/rejected": -2.1834828853607178, "logps/chosen": -2.260389804840088, "logps/rejected": -2.3755135536193848, "loss": 0.6646, "rewards/accuracies": 1.0, "rewards/chosen": 0.7141013741493225, "rewards/margins": 0.05788451433181763, "rewards/rejected": 0.6562168598175049, "step": 551 }, { "epoch": 0.3, "learning_rate": 9.94846953802013e-08, "logits/chosen": -2.1269965171813965, "logits/rejected": -2.1415328979492188, "logps/chosen": -1.2524209022521973, "logps/rejected": -8.54343318939209, "loss": 0.518, "rewards/accuracies": 1.0, "rewards/chosen": 0.8878582119941711, "rewards/margins": 0.3877377510070801, "rewards/rejected": 0.5001204609870911, "step": 552 }, { "epoch": 0.3, "learning_rate": 9.948156350150397e-08, "logits/chosen": -2.190007209777832, "logits/rejected": -2.094714879989624, "logps/chosen": -31.947357177734375, "logps/rejected": -3.713283061981201, "loss": 0.5928, "rewards/accuracies": 1.0, "rewards/chosen": 0.798858642578125, "rewards/margins": 0.2118217945098877, "rewards/rejected": 0.5870368480682373, "step": 553 }, { "epoch": 0.3, "learning_rate": 9.947842218387086e-08, "logits/chosen": -1.969305157661438, "logits/rejected": -2.2201685905456543, "logps/chosen": -1.5803191661834717, "logps/rejected": -1.6041467189788818, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.8416289687156677, "rewards/margins": 0.017185211181640625, "rewards/rejected": 0.8244437575340271, "step": 554 }, { "epoch": 0.3, "learning_rate": 9.947527142790118e-08, "logits/chosen": -2.037531614303589, "logits/rejected": -2.3286564350128174, "logps/chosen": -1.0093767642974854, "logps/rejected": -0.9627143144607544, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.7900263071060181, "rewards/margins": 0.018551766872406006, "rewards/rejected": 0.7714745402336121, "step": 555 }, { "epoch": 0.3, "learning_rate": 9.947211123419594e-08, "logits/chosen": -1.9969719648361206, "logits/rejected": -1.986156702041626, "logps/chosen": -17.180377960205078, "logps/rejected": -6.116176605224609, "loss": 0.488, "rewards/accuracies": 1.0, "rewards/chosen": 1.3885974884033203, "rewards/margins": 0.4635007977485657, "rewards/rejected": 0.9250966906547546, "step": 556 }, { "epoch": 0.3, "learning_rate": 9.946894160335802e-08, "logits/chosen": -2.0244364738464355, "logits/rejected": -2.031836748123169, "logps/chosen": -4.076132774353027, "logps/rejected": -2.783405303955078, "loss": 0.4249, "rewards/accuracies": 1.0, "rewards/chosen": 1.261074185371399, "rewards/margins": 0.635895848274231, "rewards/rejected": 0.625178337097168, "step": 557 }, { "epoch": 0.3, "learning_rate": 9.946576253599202e-08, "logits/chosen": -2.153394937515259, "logits/rejected": -2.3458251953125, "logps/chosen": -12.021696090698242, "logps/rejected": -11.344212532043457, "loss": 0.7103, "rewards/accuracies": 0.0, "rewards/chosen": 0.2060178816318512, "rewards/margins": -0.034037113189697266, "rewards/rejected": 0.24005499482154846, "step": 558 }, { "epoch": 0.3, "learning_rate": 9.946257403270435e-08, "logits/chosen": -2.080538272857666, "logits/rejected": -2.0860297679901123, "logps/chosen": -1.2730348110198975, "logps/rejected": -4.7589921951293945, "loss": 0.6101, "rewards/accuracies": 1.0, "rewards/chosen": 0.8541569113731384, "rewards/margins": 0.17354023456573486, "rewards/rejected": 0.6806166768074036, "step": 559 }, { "epoch": 0.3, "learning_rate": 9.945937609410327e-08, "logits/chosen": -2.0966169834136963, "logits/rejected": -2.318603992462158, "logps/chosen": -3.5064902305603027, "logps/rejected": -3.427025079727173, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.8680046200752258, "rewards/margins": 0.03426569700241089, "rewards/rejected": 0.8337389230728149, "step": 560 }, { "epoch": 0.3, "learning_rate": 9.94561687207988e-08, "logits/chosen": -2.187748432159424, "logits/rejected": -2.165410041809082, "logps/chosen": -5.995175361633301, "logps/rejected": -7.3707685470581055, "loss": 0.5182, "rewards/accuracies": 1.0, "rewards/chosen": 0.9489486813545227, "rewards/margins": 0.3871126174926758, "rewards/rejected": 0.5618360638618469, "step": 561 }, { "epoch": 0.3, "learning_rate": 9.945295191340275e-08, "logits/chosen": -2.1839914321899414, "logits/rejected": -2.1091253757476807, "logps/chosen": -45.285926818847656, "logps/rejected": -0.9123550057411194, "loss": 0.7354, "rewards/accuracies": 0.0, "rewards/chosen": 0.8049117922782898, "rewards/margins": -0.08281326293945312, "rewards/rejected": 0.8877250552177429, "step": 562 }, { "epoch": 0.3, "learning_rate": 9.944972567252876e-08, "logits/chosen": -2.0452990531921387, "logits/rejected": -2.240797996520996, "logps/chosen": -1.5509833097457886, "logps/rejected": -2.3152408599853516, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.8297691345214844, "rewards/margins": -0.014151036739349365, "rewards/rejected": 0.8439201712608337, "step": 563 }, { "epoch": 0.3, "learning_rate": 9.944648999879226e-08, "logits/chosen": -2.139417886734009, "logits/rejected": -2.0478742122650146, "logps/chosen": -43.06439208984375, "logps/rejected": -4.585479736328125, "loss": 0.6243, "rewards/accuracies": 1.0, "rewards/chosen": 0.8523971438407898, "rewards/margins": 0.14288020133972168, "rewards/rejected": 0.7095169425010681, "step": 564 }, { "epoch": 0.3, "learning_rate": 9.944324489281048e-08, "logits/chosen": -2.1014223098754883, "logits/rejected": -2.302828311920166, "logps/chosen": -0.806088924407959, "logps/rejected": -0.7301624417304993, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8477005362510681, "rewards/margins": 0.008622884750366211, "rewards/rejected": 0.8390776515007019, "step": 565 }, { "epoch": 0.31, "learning_rate": 9.943999035520243e-08, "logits/chosen": -2.0456812381744385, "logits/rejected": -2.0432841777801514, "logps/chosen": -4.339764595031738, "logps/rejected": -3.984503746032715, "loss": 0.5393, "rewards/accuracies": 1.0, "rewards/chosen": 1.0385483503341675, "rewards/margins": 0.3357277512550354, "rewards/rejected": 0.7028205990791321, "step": 566 }, { "epoch": 0.31, "learning_rate": 9.943672638658895e-08, "logits/chosen": -2.135150194168091, "logits/rejected": -2.255113124847412, "logps/chosen": -6.390439510345459, "logps/rejected": -6.393844127655029, "loss": 0.6494, "rewards/accuracies": 1.0, "rewards/chosen": 0.5358096957206726, "rewards/margins": 0.08958324790000916, "rewards/rejected": 0.44622644782066345, "step": 567 }, { "epoch": 0.31, "learning_rate": 9.943345298759265e-08, "logits/chosen": -1.997646450996399, "logits/rejected": -2.2637219429016113, "logps/chosen": -1.822072982788086, "logps/rejected": -1.869816541671753, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.8084335327148438, "rewards/margins": 0.02554422616958618, "rewards/rejected": 0.7828893065452576, "step": 568 }, { "epoch": 0.31, "learning_rate": 9.943017015883795e-08, "logits/chosen": -2.0642755031585693, "logits/rejected": -2.059169292449951, "logps/chosen": -7.133476257324219, "logps/rejected": -4.167699813842773, "loss": 0.5248, "rewards/accuracies": 1.0, "rewards/chosen": 0.8122676014900208, "rewards/margins": 0.37082186341285706, "rewards/rejected": 0.4414457380771637, "step": 569 }, { "epoch": 0.31, "learning_rate": 9.94268779009511e-08, "logits/chosen": -2.18210768699646, "logits/rejected": -2.124272346496582, "logps/chosen": -34.31785583496094, "logps/rejected": -1.9964449405670166, "loss": 0.5253, "rewards/accuracies": 1.0, "rewards/chosen": 0.9961086511611938, "rewards/margins": 0.3696931004524231, "rewards/rejected": 0.6264155507087708, "step": 570 }, { "epoch": 0.31, "learning_rate": 9.942357621456009e-08, "logits/chosen": -2.0579001903533936, "logits/rejected": -2.2988061904907227, "logps/chosen": -2.3217973709106445, "logps/rejected": -2.485440492630005, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.9494110345840454, "rewards/margins": 0.03317135572433472, "rewards/rejected": 0.9162396788597107, "step": 571 }, { "epoch": 0.31, "learning_rate": 9.942026510029476e-08, "logits/chosen": -1.8863564729690552, "logits/rejected": -2.306688070297241, "logps/chosen": -1.4612356424331665, "logps/rejected": -1.5072977542877197, "loss": 0.6968, "rewards/accuracies": 0.0, "rewards/chosen": 1.0081140995025635, "rewards/margins": -0.007373452186584473, "rewards/rejected": 1.015487551689148, "step": 572 }, { "epoch": 0.31, "learning_rate": 9.941694455878669e-08, "logits/chosen": -2.1779892444610596, "logits/rejected": -2.274867296218872, "logps/chosen": -8.379006385803223, "logps/rejected": -3.6208484172821045, "loss": 0.8072, "rewards/accuracies": 0.0, "rewards/chosen": 0.37869730591773987, "rewards/margins": -0.2163318693637848, "rewards/rejected": 0.5950291752815247, "step": 573 }, { "epoch": 0.31, "learning_rate": 9.941361459066934e-08, "logits/chosen": -2.094487428665161, "logits/rejected": -2.1134743690490723, "logps/chosen": -3.301787853240967, "logps/rejected": -3.1303467750549316, "loss": 0.6259, "rewards/accuracies": 1.0, "rewards/chosen": 0.9297254681587219, "rewards/margins": 0.1393771767616272, "rewards/rejected": 0.7903482913970947, "step": 574 }, { "epoch": 0.31, "learning_rate": 9.94102751965779e-08, "logits/chosen": -2.0639758110046387, "logits/rejected": -2.0677452087402344, "logps/chosen": -1.6867976188659668, "logps/rejected": -2.493177890777588, "loss": 0.5565, "rewards/accuracies": 1.0, "rewards/chosen": 0.8469274640083313, "rewards/margins": 0.29504770040512085, "rewards/rejected": 0.5518797636032104, "step": 575 }, { "epoch": 0.31, "learning_rate": 9.940692637714938e-08, "logits/chosen": -1.9605591297149658, "logits/rejected": -2.2409188747406006, "logps/chosen": -0.6534920334815979, "logps/rejected": -0.6606695055961609, "loss": 0.6686, "rewards/accuracies": 1.0, "rewards/chosen": 0.8673725128173828, "rewards/margins": 0.049754440784454346, "rewards/rejected": 0.8176180720329285, "step": 576 }, { "epoch": 0.31, "learning_rate": 9.94035681330226e-08, "logits/chosen": -2.0556836128234863, "logits/rejected": -2.0671236515045166, "logps/chosen": -8.115076065063477, "logps/rejected": -9.830408096313477, "loss": 0.3867, "rewards/accuracies": 1.0, "rewards/chosen": 1.3389981985092163, "rewards/margins": 0.7505391836166382, "rewards/rejected": 0.5884590148925781, "step": 577 }, { "epoch": 0.31, "learning_rate": 9.940020046483817e-08, "logits/chosen": -2.1116459369659424, "logits/rejected": -2.1201202869415283, "logps/chosen": -3.052983283996582, "logps/rejected": -2.5230274200439453, "loss": 0.5325, "rewards/accuracies": 1.0, "rewards/chosen": 1.0467833280563354, "rewards/margins": 0.3521459102630615, "rewards/rejected": 0.6946374177932739, "step": 578 }, { "epoch": 0.31, "learning_rate": 9.939682337323847e-08, "logits/chosen": -2.153839349746704, "logits/rejected": -2.1177303791046143, "logps/chosen": -39.37741470336914, "logps/rejected": -14.97985553741455, "loss": 0.8069, "rewards/accuracies": 0.0, "rewards/chosen": 0.5552188754081726, "rewards/margins": -0.21586084365844727, "rewards/rejected": 0.7710797190666199, "step": 579 }, { "epoch": 0.31, "learning_rate": 9.939343685886775e-08, "logits/chosen": -2.0330448150634766, "logits/rejected": -2.2394590377807617, "logps/chosen": -0.9289242029190063, "logps/rejected": -0.9418590664863586, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.9687137603759766, "rewards/margins": -0.0022296905517578125, "rewards/rejected": 0.9709434509277344, "step": 580 }, { "epoch": 0.31, "learning_rate": 9.939004092237195e-08, "logits/chosen": -2.0653393268585205, "logits/rejected": -2.0656144618988037, "logps/chosen": -9.163806915283203, "logps/rejected": -6.994215965270996, "loss": 0.7076, "rewards/accuracies": 0.0, "rewards/chosen": 0.9133674502372742, "rewards/margins": -0.02863287925720215, "rewards/rejected": 0.9420003294944763, "step": 581 }, { "epoch": 0.31, "learning_rate": 9.93866355643989e-08, "logits/chosen": -2.1449241638183594, "logits/rejected": -2.194464921951294, "logps/chosen": -8.308252334594727, "logps/rejected": -9.927637100219727, "loss": 0.5649, "rewards/accuracies": 1.0, "rewards/chosen": 0.9970130920410156, "rewards/margins": 0.27529239654541016, "rewards/rejected": 0.7217206954956055, "step": 582 }, { "epoch": 0.31, "learning_rate": 9.93832207855982e-08, "logits/chosen": -2.040872097015381, "logits/rejected": -2.238647222518921, "logps/chosen": -1.108536720275879, "logps/rejected": -1.168553352355957, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9087602496147156, "rewards/margins": 0.03045511245727539, "rewards/rejected": 0.8783051371574402, "step": 583 }, { "epoch": 0.31, "learning_rate": 9.937979658662124e-08, "logits/chosen": -2.115678310394287, "logits/rejected": -2.1396238803863525, "logps/chosen": -11.262210845947266, "logps/rejected": -17.576074600219727, "loss": 0.5557, "rewards/accuracies": 1.0, "rewards/chosen": 1.02926766872406, "rewards/margins": 0.2969098687171936, "rewards/rejected": 0.7323578000068665, "step": 584 }, { "epoch": 0.32, "learning_rate": 9.93763629681212e-08, "logits/chosen": -2.086880683898926, "logits/rejected": -2.0946154594421387, "logps/chosen": -3.2438507080078125, "logps/rejected": -2.3218345642089844, "loss": 0.5375, "rewards/accuracies": 1.0, "rewards/chosen": 1.0688097476959229, "rewards/margins": 0.3401479125022888, "rewards/rejected": 0.728661835193634, "step": 585 }, { "epoch": 0.32, "learning_rate": 9.937291993075306e-08, "logits/chosen": -2.0071423053741455, "logits/rejected": -2.268556833267212, "logps/chosen": -5.9462890625, "logps/rejected": -2.5088932514190674, "loss": 0.7409, "rewards/accuracies": 0.0, "rewards/chosen": 0.8511685729026794, "rewards/margins": -0.09328526258468628, "rewards/rejected": 0.9444538354873657, "step": 586 }, { "epoch": 0.32, "learning_rate": 9.936946747517362e-08, "logits/chosen": -2.0116360187530518, "logits/rejected": -2.250241756439209, "logps/chosen": -1.2725298404693604, "logps/rejected": -1.288236379623413, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.8089391589164734, "rewards/margins": 0.009983539581298828, "rewards/rejected": 0.7989556193351746, "step": 587 }, { "epoch": 0.32, "learning_rate": 9.936600560204145e-08, "logits/chosen": -2.1439387798309326, "logits/rejected": -2.113772392272949, "logps/chosen": -28.062210083007812, "logps/rejected": -3.336355686187744, "loss": 0.4054, "rewards/accuracies": 1.0, "rewards/chosen": 1.2866051197052002, "rewards/margins": 0.6933771967887878, "rewards/rejected": 0.5932279229164124, "step": 588 }, { "epoch": 0.32, "learning_rate": 9.936253431201691e-08, "logits/chosen": -2.0559425354003906, "logits/rejected": -2.046734094619751, "logps/chosen": -6.487293720245361, "logps/rejected": -6.041163444519043, "loss": 0.4464, "rewards/accuracies": 1.0, "rewards/chosen": 0.983013927936554, "rewards/margins": 0.575020432472229, "rewards/rejected": 0.40799352526664734, "step": 589 }, { "epoch": 0.32, "learning_rate": 9.93590536057622e-08, "logits/chosen": -2.1045095920562744, "logits/rejected": -2.0895280838012695, "logps/chosen": -43.277584075927734, "logps/rejected": -27.134605407714844, "loss": 0.5994, "rewards/accuracies": 1.0, "rewards/chosen": 0.9463634490966797, "rewards/margins": 0.1971927285194397, "rewards/rejected": 0.74917072057724, "step": 590 }, { "epoch": 0.32, "learning_rate": 9.935556348394126e-08, "logits/chosen": -2.138592004776001, "logits/rejected": -2.249253034591675, "logps/chosen": -6.012698650360107, "logps/rejected": -5.635072708129883, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.7359570860862732, "rewards/margins": 0.004628002643585205, "rewards/rejected": 0.731329083442688, "step": 591 }, { "epoch": 0.32, "learning_rate": 9.935206394721988e-08, "logits/chosen": -2.1521871089935303, "logits/rejected": -2.2932889461517334, "logps/chosen": -1.9820857048034668, "logps/rejected": -1.9756478071212769, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.7671384215354919, "rewards/margins": -0.005696773529052734, "rewards/rejected": 0.7728351950645447, "step": 592 }, { "epoch": 0.32, "learning_rate": 9.93485549962656e-08, "logits/chosen": -2.0473952293395996, "logits/rejected": -2.223325490951538, "logps/chosen": -0.8983561396598816, "logps/rejected": -0.9340792894363403, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.7517295479774475, "rewards/margins": 0.01181483268737793, "rewards/rejected": 0.7399147152900696, "step": 593 }, { "epoch": 0.32, "learning_rate": 9.934503663174778e-08, "logits/chosen": -2.152515411376953, "logits/rejected": -2.2138237953186035, "logps/chosen": -5.773149013519287, "logps/rejected": -3.4523191452026367, "loss": 0.7102, "rewards/accuracies": 0.0, "rewards/chosen": 0.718572199344635, "rewards/margins": -0.0338364839553833, "rewards/rejected": 0.7524086833000183, "step": 594 }, { "epoch": 0.32, "learning_rate": 9.934150885433758e-08, "logits/chosen": -2.1587610244750977, "logits/rejected": -2.1529810428619385, "logps/chosen": -5.864860534667969, "logps/rejected": -5.491591453552246, "loss": 0.6551, "rewards/accuracies": 1.0, "rewards/chosen": 0.665497899055481, "rewards/margins": 0.07757186889648438, "rewards/rejected": 0.5879260301589966, "step": 595 }, { "epoch": 0.32, "learning_rate": 9.933797166470794e-08, "logits/chosen": -2.1486144065856934, "logits/rejected": -2.178183078765869, "logps/chosen": -1.3266065120697021, "logps/rejected": -10.017745018005371, "loss": 0.6169, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427253603935242, "rewards/margins": 0.15870046615600586, "rewards/rejected": 0.7840248942375183, "step": 596 }, { "epoch": 0.32, "learning_rate": 9.93344250635336e-08, "logits/chosen": -2.0211329460144043, "logits/rejected": -2.013277769088745, "logps/chosen": -11.958293914794922, "logps/rejected": -1.4381670951843262, "loss": 0.6494, "rewards/accuracies": 1.0, "rewards/chosen": 0.9327301383018494, "rewards/margins": 0.08948349952697754, "rewards/rejected": 0.8432466387748718, "step": 597 }, { "epoch": 0.32, "learning_rate": 9.933086905149111e-08, "logits/chosen": -2.0381875038146973, "logits/rejected": -2.3119821548461914, "logps/chosen": -1.193152666091919, "logps/rejected": -3.251389503479004, "loss": 0.5563, "rewards/accuracies": 1.0, "rewards/chosen": 0.8686102032661438, "rewards/margins": 0.2953926920890808, "rewards/rejected": 0.573217511177063, "step": 598 }, { "epoch": 0.32, "learning_rate": 9.932730362925878e-08, "logits/chosen": -2.0016748905181885, "logits/rejected": -1.9985966682434082, "logps/chosen": -10.169502258300781, "logps/rejected": -3.310986042022705, "loss": 0.4269, "rewards/accuracies": 1.0, "rewards/chosen": 1.1532834768295288, "rewards/margins": 0.6300470232963562, "rewards/rejected": 0.5232364535331726, "step": 599 }, { "epoch": 0.32, "learning_rate": 9.932372879751675e-08, "logits/chosen": -2.035386800765991, "logits/rejected": -2.228454351425171, "logps/chosen": -1.0857996940612793, "logps/rejected": -1.0623215436935425, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.8830728530883789, "rewards/margins": 0.0051628947257995605, "rewards/rejected": 0.8779099583625793, "step": 600 }, { "epoch": 0.32, "learning_rate": 9.932014455694698e-08, "logits/chosen": -1.991107702255249, "logits/rejected": -2.0005125999450684, "logps/chosen": -2.7956979274749756, "logps/rejected": -10.265403747558594, "loss": 0.6511, "rewards/accuracies": 1.0, "rewards/chosen": 0.8304367065429688, "rewards/margins": 0.0859302282333374, "rewards/rejected": 0.7445064783096313, "step": 601 }, { "epoch": 0.32, "learning_rate": 9.931655090823311e-08, "logits/chosen": -1.9745310544967651, "logits/rejected": -2.2191691398620605, "logps/chosen": -4.084176063537598, "logps/rejected": -4.01047945022583, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.8390805125236511, "rewards/margins": 0.015555620193481445, "rewards/rejected": 0.8235248923301697, "step": 602 }, { "epoch": 0.33, "learning_rate": 9.93129478520607e-08, "logits/chosen": -2.0722217559814453, "logits/rejected": -2.0756123065948486, "logps/chosen": -3.455936908721924, "logps/rejected": -3.136927366256714, "loss": 0.5179, "rewards/accuracies": 1.0, "rewards/chosen": 0.9940834045410156, "rewards/margins": 0.3878716230392456, "rewards/rejected": 0.60621178150177, "step": 603 }, { "epoch": 0.33, "learning_rate": 9.930933538911708e-08, "logits/chosen": -2.0706706047058105, "logits/rejected": -2.3187127113342285, "logps/chosen": -1.515453577041626, "logps/rejected": -1.4257283210754395, "loss": 0.7082, "rewards/accuracies": 0.0, "rewards/chosen": 0.7793699502944946, "rewards/margins": -0.029951632022857666, "rewards/rejected": 0.8093215823173523, "step": 604 }, { "epoch": 0.33, "learning_rate": 9.93057135200913e-08, "logits/chosen": -2.0747437477111816, "logits/rejected": -2.045644998550415, "logps/chosen": -16.714162826538086, "logps/rejected": -3.848785877227783, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 1.1101633310317993, "rewards/margins": 0.4241308569908142, "rewards/rejected": 0.6860324740409851, "step": 605 }, { "epoch": 0.33, "learning_rate": 9.930208224567427e-08, "logits/chosen": -2.1208102703094482, "logits/rejected": -2.0666043758392334, "logps/chosen": -41.40364456176758, "logps/rejected": -9.563127517700195, "loss": 0.5578, "rewards/accuracies": 1.0, "rewards/chosen": 1.034406304359436, "rewards/margins": 0.2919059991836548, "rewards/rejected": 0.7425003051757812, "step": 606 }, { "epoch": 0.33, "learning_rate": 9.92984415665587e-08, "logits/chosen": -2.1820409297943115, "logits/rejected": -2.1811695098876953, "logps/chosen": -1.4592697620391846, "logps/rejected": -4.216004371643066, "loss": 0.5105, "rewards/accuracies": 1.0, "rewards/chosen": 0.9952242970466614, "rewards/margins": 0.40630847215652466, "rewards/rejected": 0.5889158248901367, "step": 607 }, { "epoch": 0.33, "learning_rate": 9.929479148343906e-08, "logits/chosen": -2.046895742416382, "logits/rejected": -2.2591781616210938, "logps/chosen": -2.4745681285858154, "logps/rejected": -2.6489429473876953, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.9905497431755066, "rewards/margins": -0.007451355457305908, "rewards/rejected": 0.9980010986328125, "step": 608 }, { "epoch": 0.33, "learning_rate": 9.929113199701163e-08, "logits/chosen": -2.0274546146392822, "logits/rejected": -2.02872371673584, "logps/chosen": -4.667678356170654, "logps/rejected": -3.151982545852661, "loss": 0.5962, "rewards/accuracies": 1.0, "rewards/chosen": 0.9026274085044861, "rewards/margins": 0.20440983772277832, "rewards/rejected": 0.6982175707817078, "step": 609 }, { "epoch": 0.33, "learning_rate": 9.928746310797447e-08, "logits/chosen": -2.021627187728882, "logits/rejected": -2.212061643600464, "logps/chosen": -1.1339337825775146, "logps/rejected": -1.1619986295700073, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.8015322685241699, "rewards/margins": 0.007928729057312012, "rewards/rejected": 0.7936035394668579, "step": 610 }, { "epoch": 0.33, "learning_rate": 9.928378481702746e-08, "logits/chosen": -1.9701638221740723, "logits/rejected": -1.9710685014724731, "logps/chosen": -2.687164783477783, "logps/rejected": -1.4637620449066162, "loss": 0.6506, "rewards/accuracies": 1.0, "rewards/chosen": 0.8612037897109985, "rewards/margins": 0.0869402289390564, "rewards/rejected": 0.7742635607719421, "step": 611 }, { "epoch": 0.33, "learning_rate": 9.928009712487226e-08, "logits/chosen": -2.0628955364227295, "logits/rejected": -2.255202531814575, "logps/chosen": -0.7844420671463013, "logps/rejected": -0.7642381191253662, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.822706401348114, "rewards/margins": 0.028492212295532227, "rewards/rejected": 0.7942141890525818, "step": 612 }, { "epoch": 0.33, "learning_rate": 9.927640003221231e-08, "logits/chosen": -2.0775163173675537, "logits/rejected": -2.0895678997039795, "logps/chosen": -16.123079299926758, "logps/rejected": -6.251269340515137, "loss": 0.4581, "rewards/accuracies": 1.0, "rewards/chosen": 1.2823429107666016, "rewards/margins": 0.5428704023361206, "rewards/rejected": 0.739472508430481, "step": 613 }, { "epoch": 0.33, "learning_rate": 9.927269353975288e-08, "logits/chosen": -2.045621156692505, "logits/rejected": -2.051398754119873, "logps/chosen": -2.235219717025757, "logps/rejected": -3.8613808155059814, "loss": 0.5187, "rewards/accuracies": 1.0, "rewards/chosen": 0.947936475276947, "rewards/margins": 0.38578176498413086, "rewards/rejected": 0.5621547102928162, "step": 614 }, { "epoch": 0.33, "learning_rate": 9.926897764820095e-08, "logits/chosen": -2.0431528091430664, "logits/rejected": -2.2290353775024414, "logps/chosen": -1.8150053024291992, "logps/rejected": -1.8541338443756104, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9971145987510681, "rewards/margins": 0.020200252532958984, "rewards/rejected": 0.9769143462181091, "step": 615 }, { "epoch": 0.33, "learning_rate": 9.92652523582654e-08, "logits/chosen": -2.0340311527252197, "logits/rejected": -2.034498453140259, "logps/chosen": -0.8478711843490601, "logps/rejected": -2.8726890087127686, "loss": 0.5797, "rewards/accuracies": 1.0, "rewards/chosen": 0.8849958777427673, "rewards/margins": 0.24151849746704102, "rewards/rejected": 0.6434773802757263, "step": 616 }, { "epoch": 0.33, "learning_rate": 9.926151767065685e-08, "logits/chosen": -2.0534567832946777, "logits/rejected": -2.285862445831299, "logps/chosen": -7.191434383392334, "logps/rejected": -2.1760146617889404, "loss": 0.7576, "rewards/accuracies": 0.0, "rewards/chosen": 0.8723102807998657, "rewards/margins": -0.12496882677078247, "rewards/rejected": 0.9972791075706482, "step": 617 }, { "epoch": 0.33, "learning_rate": 9.925777358608772e-08, "logits/chosen": -2.0476574897766113, "logits/rejected": -2.0473554134368896, "logps/chosen": -7.235691070556641, "logps/rejected": -6.500668525695801, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 1.0950239896774292, "rewards/margins": 0.14996761083602905, "rewards/rejected": 0.9450563788414001, "step": 618 }, { "epoch": 0.33, "learning_rate": 9.92540201052722e-08, "logits/chosen": -2.1095287799835205, "logits/rejected": -2.1113040447235107, "logps/chosen": -4.176448345184326, "logps/rejected": -1.8015422821044922, "loss": 0.7039, "rewards/accuracies": 0.0, "rewards/chosen": 0.90566486120224, "rewards/margins": -0.02134263515472412, "rewards/rejected": 0.9270074963569641, "step": 619 }, { "epoch": 0.33, "learning_rate": 9.925025722892628e-08, "logits/chosen": -2.101624011993408, "logits/rejected": -2.3087356090545654, "logps/chosen": -2.287537097930908, "logps/rejected": -18.811864852905273, "loss": 0.5311, "rewards/accuracies": 1.0, "rewards/chosen": 1.0899566411972046, "rewards/margins": 0.3556172847747803, "rewards/rejected": 0.7343393564224243, "step": 620 }, { "epoch": 0.33, "learning_rate": 9.92464849577678e-08, "logits/chosen": -2.0036017894744873, "logits/rejected": -2.0169084072113037, "logps/chosen": -36.55059814453125, "logps/rejected": -21.492279052734375, "loss": 0.8093, "rewards/accuracies": 0.0, "rewards/chosen": 0.13645592331886292, "rewards/margins": -0.2201780378818512, "rewards/rejected": 0.3566339612007141, "step": 621 }, { "epoch": 0.34, "learning_rate": 9.92427032925163e-08, "logits/chosen": -1.9902116060256958, "logits/rejected": -1.9896022081375122, "logps/chosen": -6.516794204711914, "logps/rejected": -14.432868957519531, "loss": 0.5327, "rewards/accuracies": 1.0, "rewards/chosen": 0.8006948828697205, "rewards/margins": 0.35166114568710327, "rewards/rejected": 0.4490337371826172, "step": 622 }, { "epoch": 0.34, "learning_rate": 9.923891223389318e-08, "logits/chosen": -2.0659401416778564, "logits/rejected": -2.2941153049468994, "logps/chosen": -18.80475616455078, "logps/rejected": -14.461065292358398, "loss": 0.7421, "rewards/accuracies": 0.0, "rewards/chosen": 0.6256400942802429, "rewards/margins": -0.09565567970275879, "rewards/rejected": 0.7212957739830017, "step": 623 }, { "epoch": 0.34, "learning_rate": 9.92351117826216e-08, "logits/chosen": -2.0670011043548584, "logits/rejected": -2.3068063259124756, "logps/chosen": -2.3312859535217285, "logps/rejected": -2.0686235427856445, "loss": 0.7074, "rewards/accuracies": 0.0, "rewards/chosen": 0.8005949258804321, "rewards/margins": -0.028310954570770264, "rewards/rejected": 0.8289058804512024, "step": 624 }, { "epoch": 0.34, "learning_rate": 9.923130193942652e-08, "logits/chosen": -1.921531081199646, "logits/rejected": -2.1732780933380127, "logps/chosen": -4.2610602378845215, "logps/rejected": -1.3792206048965454, "loss": 0.7231, "rewards/accuracies": 0.0, "rewards/chosen": 0.7601035833358765, "rewards/margins": -0.05894970893859863, "rewards/rejected": 0.8190532922744751, "step": 625 }, { "epoch": 0.34, "learning_rate": 9.922748270503471e-08, "logits/chosen": -2.1738638877868652, "logits/rejected": -2.2786290645599365, "logps/chosen": -11.191308975219727, "logps/rejected": -32.91429901123047, "loss": 0.4897, "rewards/accuracies": 1.0, "rewards/chosen": 0.8840600848197937, "rewards/margins": 0.4592672288417816, "rewards/rejected": 0.4247928559780121, "step": 626 }, { "epoch": 0.34, "learning_rate": 9.922365408017472e-08, "logits/chosen": -2.075578451156616, "logits/rejected": -2.2722325325012207, "logps/chosen": -5.085997104644775, "logps/rejected": -5.116426467895508, "loss": 0.7528, "rewards/accuracies": 0.0, "rewards/chosen": 0.617658793926239, "rewards/margins": -0.11595308780670166, "rewards/rejected": 0.7336118817329407, "step": 627 }, { "epoch": 0.34, "learning_rate": 9.921981606557686e-08, "logits/chosen": -2.0146079063415527, "logits/rejected": -2.0144999027252197, "logps/chosen": -0.8912578821182251, "logps/rejected": -2.620014190673828, "loss": 0.5781, "rewards/accuracies": 1.0, "rewards/chosen": 0.9759525656700134, "rewards/margins": 0.24515467882156372, "rewards/rejected": 0.7307978868484497, "step": 628 }, { "epoch": 0.34, "learning_rate": 9.921596866197326e-08, "logits/chosen": -2.0445542335510254, "logits/rejected": -2.043877124786377, "logps/chosen": -0.9239912033081055, "logps/rejected": -2.5769364833831787, "loss": 0.568, "rewards/accuracies": 1.0, "rewards/chosen": 0.9533605575561523, "rewards/margins": 0.2682279348373413, "rewards/rejected": 0.685132622718811, "step": 629 }, { "epoch": 0.34, "learning_rate": 9.921211187009783e-08, "logits/chosen": -2.04488468170166, "logits/rejected": -2.04805850982666, "logps/chosen": -6.485246181488037, "logps/rejected": -1.0789588689804077, "loss": 0.5938, "rewards/accuracies": 1.0, "rewards/chosen": 1.0977739095687866, "rewards/margins": 0.20973020792007446, "rewards/rejected": 0.8880437016487122, "step": 630 }, { "epoch": 0.34, "learning_rate": 9.920824569068631e-08, "logits/chosen": -2.163914442062378, "logits/rejected": -2.253648281097412, "logps/chosen": -6.087140083312988, "logps/rejected": -1.7023022174835205, "loss": 0.7645, "rewards/accuracies": 0.0, "rewards/chosen": 0.729120671749115, "rewards/margins": -0.13799738883972168, "rewards/rejected": 0.8671180605888367, "step": 631 }, { "epoch": 0.34, "learning_rate": 9.920437012447618e-08, "logits/chosen": -2.1709561347961426, "logits/rejected": -2.214346170425415, "logps/chosen": -1.276200532913208, "logps/rejected": -1.3091814517974854, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.7824187278747559, "rewards/margins": 0.008820831775665283, "rewards/rejected": 0.7735978960990906, "step": 632 }, { "epoch": 0.34, "learning_rate": 9.920048517220672e-08, "logits/chosen": -2.0597965717315674, "logits/rejected": -2.2269484996795654, "logps/chosen": -3.158665418624878, "logps/rejected": -7.003093719482422, "loss": 0.619, "rewards/accuracies": 1.0, "rewards/chosen": 0.5724913477897644, "rewards/margins": 0.1542740762233734, "rewards/rejected": 0.418217271566391, "step": 633 }, { "epoch": 0.34, "learning_rate": 9.919659083461904e-08, "logits/chosen": -2.139291524887085, "logits/rejected": -2.2993998527526855, "logps/chosen": -10.4844970703125, "logps/rejected": -10.156749725341797, "loss": 0.704, "rewards/accuracies": 0.0, "rewards/chosen": 0.7868953943252563, "rewards/margins": -0.021684467792510986, "rewards/rejected": 0.8085798621177673, "step": 634 }, { "epoch": 0.34, "learning_rate": 9.919268711245598e-08, "logits/chosen": -2.136631488800049, "logits/rejected": -2.2505178451538086, "logps/chosen": -1.61542546749115, "logps/rejected": -4.603644847869873, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 0.9041582942008972, "rewards/margins": 0.03384828567504883, "rewards/rejected": 0.8703100085258484, "step": 635 }, { "epoch": 0.34, "learning_rate": 9.918877400646222e-08, "logits/chosen": -2.114739179611206, "logits/rejected": -2.3090291023254395, "logps/chosen": -6.910811901092529, "logps/rejected": -6.537664413452148, "loss": 0.6499, "rewards/accuracies": 1.0, "rewards/chosen": 0.8811756372451782, "rewards/margins": 0.08842843770980835, "rewards/rejected": 0.7927471995353699, "step": 636 }, { "epoch": 0.34, "learning_rate": 9.91848515173842e-08, "logits/chosen": -2.1392335891723633, "logits/rejected": -2.230527639389038, "logps/chosen": -2.810239315032959, "logps/rejected": -3.205766201019287, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 0.665138304233551, "rewards/margins": 0.05134660005569458, "rewards/rejected": 0.6137917041778564, "step": 637 }, { "epoch": 0.34, "learning_rate": 9.918091964597016e-08, "logits/chosen": -2.0894198417663574, "logits/rejected": -1.9814281463623047, "logps/chosen": -23.691650390625, "logps/rejected": -11.593206405639648, "loss": 0.6577, "rewards/accuracies": 1.0, "rewards/chosen": 0.8737110495567322, "rewards/margins": 0.07215750217437744, "rewards/rejected": 0.8015535473823547, "step": 638 }, { "epoch": 0.34, "learning_rate": 9.917697839297015e-08, "logits/chosen": -2.104360580444336, "logits/rejected": -1.9267996549606323, "logps/chosen": -48.285892486572266, "logps/rejected": -2.1575121879577637, "loss": 0.5099, "rewards/accuracies": 1.0, "rewards/chosen": 1.1050900220870972, "rewards/margins": 0.4077083468437195, "rewards/rejected": 0.6973816752433777, "step": 639 }, { "epoch": 0.35, "learning_rate": 9.917302775913596e-08, "logits/chosen": -2.0332000255584717, "logits/rejected": -2.0386595726013184, "logps/chosen": -12.356903076171875, "logps/rejected": -6.431344032287598, "loss": 0.4849, "rewards/accuracies": 1.0, "rewards/chosen": 1.2656601667404175, "rewards/margins": 0.47158879041671753, "rewards/rejected": 0.7940713763237, "step": 640 }, { "epoch": 0.35, "learning_rate": 9.916906774522123e-08, "logits/chosen": -2.058368682861328, "logits/rejected": -2.0620479583740234, "logps/chosen": -4.388847351074219, "logps/rejected": -0.5800524950027466, "loss": 0.5749, "rewards/accuracies": 1.0, "rewards/chosen": 1.1200673580169678, "rewards/margins": 0.25235819816589355, "rewards/rejected": 0.8677091598510742, "step": 641 }, { "epoch": 0.35, "learning_rate": 9.916509835198132e-08, "logits/chosen": -2.043304204940796, "logits/rejected": -2.0505521297454834, "logps/chosen": -2.3018431663513184, "logps/rejected": -3.8963851928710938, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": 1.151633858680725, "rewards/margins": 0.4704384207725525, "rewards/rejected": 0.6811954379081726, "step": 642 }, { "epoch": 0.35, "learning_rate": 9.916111958017346e-08, "logits/chosen": -1.9491705894470215, "logits/rejected": -2.1991302967071533, "logps/chosen": -1.041721224784851, "logps/rejected": -1.0769001245498657, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.8320650458335876, "rewards/margins": 0.01788121461868286, "rewards/rejected": 0.8141838312149048, "step": 643 }, { "epoch": 0.35, "learning_rate": 9.915713143055662e-08, "logits/chosen": -2.0917603969573975, "logits/rejected": -2.060403347015381, "logps/chosen": -7.001653671264648, "logps/rejected": -4.639501094818115, "loss": 0.48, "rewards/accuracies": 1.0, "rewards/chosen": 1.0338267087936401, "rewards/margins": 0.4844285845756531, "rewards/rejected": 0.5493981242179871, "step": 644 }, { "epoch": 0.35, "learning_rate": 9.915313390389152e-08, "logits/chosen": -2.0153026580810547, "logits/rejected": -2.2178497314453125, "logps/chosen": -1.5639402866363525, "logps/rejected": -1.6370893716812134, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 0.6812410354614258, "rewards/margins": 0.03291887044906616, "rewards/rejected": 0.6483221650123596, "step": 645 }, { "epoch": 0.35, "learning_rate": 9.914912700094078e-08, "logits/chosen": -2.220301866531372, "logits/rejected": -2.221710681915283, "logps/chosen": -1.9805771112442017, "logps/rejected": -1.6054280996322632, "loss": 0.5483, "rewards/accuracies": 1.0, "rewards/chosen": 0.9722103476524353, "rewards/margins": 0.31420111656188965, "rewards/rejected": 0.6580092310905457, "step": 646 }, { "epoch": 0.35, "learning_rate": 9.914511072246871e-08, "logits/chosen": -2.1107332706451416, "logits/rejected": -2.1067051887512207, "logps/chosen": -2.5667922496795654, "logps/rejected": -10.072078704833984, "loss": 0.5725, "rewards/accuracies": 1.0, "rewards/chosen": 1.041002631187439, "rewards/margins": 0.25778597593307495, "rewards/rejected": 0.783216655254364, "step": 647 }, { "epoch": 0.35, "learning_rate": 9.914108506924144e-08, "logits/chosen": -2.0198793411254883, "logits/rejected": -2.0217723846435547, "logps/chosen": -6.492705821990967, "logps/rejected": -2.511038064956665, "loss": 0.3829, "rewards/accuracies": 1.0, "rewards/chosen": 1.3355845212936401, "rewards/margins": 0.762373149394989, "rewards/rejected": 0.5732113718986511, "step": 648 }, { "epoch": 0.35, "learning_rate": 9.91370500420269e-08, "logits/chosen": -2.117786407470703, "logits/rejected": -2.1131913661956787, "logps/chosen": -3.2361156940460205, "logps/rejected": -4.304546356201172, "loss": 0.8201, "rewards/accuracies": 0.0, "rewards/chosen": 0.6905705332756042, "rewards/margins": -0.23964208364486694, "rewards/rejected": 0.9302126169204712, "step": 649 }, { "epoch": 0.35, "learning_rate": 9.913300564159476e-08, "logits/chosen": -2.0447301864624023, "logits/rejected": -2.0436136722564697, "logps/chosen": -1.7651071548461914, "logps/rejected": -1.6667660474777222, "loss": 0.6556, "rewards/accuracies": 1.0, "rewards/chosen": 0.9143385291099548, "rewards/margins": 0.07659304141998291, "rewards/rejected": 0.8377454876899719, "step": 650 }, { "epoch": 0.35, "learning_rate": 9.912895186871659e-08, "logits/chosen": -2.046030282974243, "logits/rejected": -2.047717809677124, "logps/chosen": -1.1292155981063843, "logps/rejected": -4.031348705291748, "loss": 0.5736, "rewards/accuracies": 1.0, "rewards/chosen": 0.90467768907547, "rewards/margins": 0.2554208040237427, "rewards/rejected": 0.6492568850517273, "step": 651 }, { "epoch": 0.35, "learning_rate": 9.912488872416562e-08, "logits/chosen": -1.9707905054092407, "logits/rejected": -1.9776530265808105, "logps/chosen": -3.5018486976623535, "logps/rejected": -4.131696701049805, "loss": 0.451, "rewards/accuracies": 1.0, "rewards/chosen": 1.037445306777954, "rewards/margins": 0.5624169111251831, "rewards/rejected": 0.4750284254550934, "step": 652 }, { "epoch": 0.35, "learning_rate": 9.912081620871692e-08, "logits/chosen": -2.125917673110962, "logits/rejected": -2.112671375274658, "logps/chosen": -10.329215049743652, "logps/rejected": -3.2509164810180664, "loss": 0.5741, "rewards/accuracies": 1.0, "rewards/chosen": 1.0116124153137207, "rewards/margins": 0.25425225496292114, "rewards/rejected": 0.7573601603507996, "step": 653 }, { "epoch": 0.35, "learning_rate": 9.911673432314739e-08, "logits/chosen": -2.096151113510132, "logits/rejected": -2.100362539291382, "logps/chosen": -3.950355291366577, "logps/rejected": -6.777316093444824, "loss": 0.5237, "rewards/accuracies": 1.0, "rewards/chosen": 0.9677538275718689, "rewards/margins": 0.3736775517463684, "rewards/rejected": 0.5940762758255005, "step": 654 }, { "epoch": 0.35, "learning_rate": 9.911264306823564e-08, "logits/chosen": -2.1505346298217773, "logits/rejected": -2.2233283519744873, "logps/chosen": -25.226709365844727, "logps/rejected": -2.139688014984131, "loss": 1.0037, "rewards/accuracies": 0.0, "rewards/chosen": 0.45659199357032776, "rewards/margins": -0.5470951795578003, "rewards/rejected": 1.0036871433258057, "step": 655 }, { "epoch": 0.35, "learning_rate": 9.91085424447621e-08, "logits/chosen": -2.0610668659210205, "logits/rejected": -2.2456843852996826, "logps/chosen": -1.4718159437179565, "logps/rejected": -3.6735823154449463, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": 0.800542950630188, "rewards/margins": 0.13257712125778198, "rewards/rejected": 0.667965829372406, "step": 656 }, { "epoch": 0.35, "learning_rate": 9.910443245350903e-08, "logits/chosen": -2.103581190109253, "logits/rejected": -2.2936112880706787, "logps/chosen": -4.071319580078125, "logps/rejected": -2.393724203109741, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0373904705047607, "rewards/margins": 0.018563151359558105, "rewards/rejected": 1.0188273191452026, "step": 657 }, { "epoch": 0.35, "learning_rate": 9.910031309526039e-08, "logits/chosen": -1.9870986938476562, "logits/rejected": -2.0003669261932373, "logps/chosen": -2.2915847301483154, "logps/rejected": -7.510568618774414, "loss": 0.5315, "rewards/accuracies": 1.0, "rewards/chosen": 1.0809365510940552, "rewards/margins": 0.3546094298362732, "rewards/rejected": 0.726327121257782, "step": 658 }, { "epoch": 0.36, "learning_rate": 9.909618437080202e-08, "logits/chosen": -1.992440104484558, "logits/rejected": -1.9983750581741333, "logps/chosen": -3.0703625679016113, "logps/rejected": -2.355870246887207, "loss": 0.5469, "rewards/accuracies": 1.0, "rewards/chosen": 1.0948972702026367, "rewards/margins": 0.317533016204834, "rewards/rejected": 0.7773642539978027, "step": 659 }, { "epoch": 0.36, "learning_rate": 9.909204628092147e-08, "logits/chosen": -1.9419969320297241, "logits/rejected": -2.2482590675354004, "logps/chosen": -0.5705732107162476, "logps/rejected": -0.6212806105613708, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9079472422599792, "rewards/margins": 0.012630701065063477, "rewards/rejected": 0.8953165411949158, "step": 660 }, { "epoch": 0.36, "learning_rate": 9.908789882640811e-08, "logits/chosen": -2.128342390060425, "logits/rejected": -2.1082942485809326, "logps/chosen": -13.644512176513672, "logps/rejected": -3.786681652069092, "loss": 0.5376, "rewards/accuracies": 1.0, "rewards/chosen": 0.9135408401489258, "rewards/margins": 0.33973199129104614, "rewards/rejected": 0.5738088488578796, "step": 661 }, { "epoch": 0.36, "learning_rate": 9.908374200805312e-08, "logits/chosen": -1.998172402381897, "logits/rejected": -1.993906021118164, "logps/chosen": -7.01711368560791, "logps/rejected": -1.9170758724212646, "loss": 0.5637, "rewards/accuracies": 1.0, "rewards/chosen": 1.1002780199050903, "rewards/margins": 0.27826833724975586, "rewards/rejected": 0.8220096826553345, "step": 662 }, { "epoch": 0.36, "learning_rate": 9.907957582664941e-08, "logits/chosen": -2.035691976547241, "logits/rejected": -2.2600836753845215, "logps/chosen": -2.044455051422119, "logps/rejected": -2.6354362964630127, "loss": 0.6999, "rewards/accuracies": 0.0, "rewards/chosen": 0.810616135597229, "rewards/margins": -0.013415396213531494, "rewards/rejected": 0.8240315318107605, "step": 663 }, { "epoch": 0.36, "learning_rate": 9.907540028299172e-08, "logits/chosen": -2.1555066108703613, "logits/rejected": -2.097242593765259, "logps/chosen": -23.74456024169922, "logps/rejected": -9.039959907531738, "loss": 0.3643, "rewards/accuracies": 1.0, "rewards/chosen": 1.3233222961425781, "rewards/margins": 0.8220911622047424, "rewards/rejected": 0.5012311339378357, "step": 664 }, { "epoch": 0.36, "learning_rate": 9.907121537787656e-08, "logits/chosen": -2.127068281173706, "logits/rejected": -2.12715482711792, "logps/chosen": -2.6185078620910645, "logps/rejected": -1.8330118656158447, "loss": 0.6739, "rewards/accuracies": 1.0, "rewards/chosen": 0.9090067148208618, "rewards/margins": 0.03897327184677124, "rewards/rejected": 0.8700334429740906, "step": 665 }, { "epoch": 0.36, "learning_rate": 9.906702111210224e-08, "logits/chosen": -1.9435617923736572, "logits/rejected": -1.9542207717895508, "logps/chosen": -2.838502883911133, "logps/rejected": -3.4220995903015137, "loss": 0.5829, "rewards/accuracies": 1.0, "rewards/chosen": 0.7482103705406189, "rewards/margins": 0.23407292366027832, "rewards/rejected": 0.5141374468803406, "step": 666 }, { "epoch": 0.36, "learning_rate": 9.906281748646883e-08, "logits/chosen": -2.135129690170288, "logits/rejected": -2.254396915435791, "logps/chosen": -3.0057902336120605, "logps/rejected": -1.2112624645233154, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 0.8823705911636353, "rewards/margins": 0.04514247179031372, "rewards/rejected": 0.8372281193733215, "step": 667 }, { "epoch": 0.36, "learning_rate": 9.905860450177821e-08, "logits/chosen": -2.1894969940185547, "logits/rejected": -2.1699206829071045, "logps/chosen": -21.80642318725586, "logps/rejected": -2.1359870433807373, "loss": 0.575, "rewards/accuracies": 1.0, "rewards/chosen": 0.8792724609375, "rewards/margins": 0.25206977128982544, "rewards/rejected": 0.6272026896476746, "step": 668 }, { "epoch": 0.36, "learning_rate": 9.905438215883402e-08, "logits/chosen": -2.112596273422241, "logits/rejected": -2.126737117767334, "logps/chosen": -4.5825300216674805, "logps/rejected": -3.66227388381958, "loss": 0.495, "rewards/accuracies": 1.0, "rewards/chosen": 1.117660641670227, "rewards/margins": 0.4455358386039734, "rewards/rejected": 0.6721248030662537, "step": 669 }, { "epoch": 0.36, "learning_rate": 9.905015045844171e-08, "logits/chosen": -1.9500267505645752, "logits/rejected": -2.234003782272339, "logps/chosen": -0.6234943270683289, "logps/rejected": -0.6990639567375183, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.9033128619194031, "rewards/margins": 0.02568233013153076, "rewards/rejected": 0.8776305317878723, "step": 670 }, { "epoch": 0.36, "learning_rate": 9.904590940140852e-08, "logits/chosen": -2.139756679534912, "logits/rejected": -2.1416194438934326, "logps/chosen": -0.9799417853355408, "logps/rejected": -3.039477586746216, "loss": 0.5762, "rewards/accuracies": 1.0, "rewards/chosen": 0.7796900272369385, "rewards/margins": 0.24951106309890747, "rewards/rejected": 0.530178964138031, "step": 671 }, { "epoch": 0.36, "learning_rate": 9.904165898854342e-08, "logits/chosen": -2.0375161170959473, "logits/rejected": -2.237140417098999, "logps/chosen": -3.6923060417175293, "logps/rejected": -3.593078136444092, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.4455147385597229, "rewards/margins": 0.013246357440948486, "rewards/rejected": 0.4322683811187744, "step": 672 }, { "epoch": 0.36, "learning_rate": 9.903739922065724e-08, "logits/chosen": -1.9503769874572754, "logits/rejected": -1.9478216171264648, "logps/chosen": -7.578834533691406, "logps/rejected": -4.141637325286865, "loss": 0.4041, "rewards/accuracies": 1.0, "rewards/chosen": 1.3215487003326416, "rewards/margins": 0.6971659660339355, "rewards/rejected": 0.624382734298706, "step": 673 }, { "epoch": 0.36, "learning_rate": 9.903313009856253e-08, "logits/chosen": -2.195000410079956, "logits/rejected": -2.2612571716308594, "logps/chosen": -1.7148144245147705, "logps/rejected": -2.430617332458496, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.9493032693862915, "rewards/margins": 0.03037738800048828, "rewards/rejected": 0.9189258813858032, "step": 674 }, { "epoch": 0.36, "learning_rate": 9.90288516230737e-08, "logits/chosen": -1.954736351966858, "logits/rejected": -2.2684671878814697, "logps/chosen": -2.181161880493164, "logps/rejected": -2.197056531906128, "loss": 0.7136, "rewards/accuracies": 0.0, "rewards/chosen": 0.9053950309753418, "rewards/margins": -0.04048413038253784, "rewards/rejected": 0.9458791613578796, "step": 675 }, { "epoch": 0.36, "learning_rate": 9.902456379500684e-08, "logits/chosen": -2.1291751861572266, "logits/rejected": -2.2528228759765625, "logps/chosen": -4.167261600494385, "logps/rejected": -0.8574548959732056, "loss": 0.7105, "rewards/accuracies": 0.0, "rewards/chosen": 0.907875657081604, "rewards/margins": -0.03440624475479126, "rewards/rejected": 0.9422819018363953, "step": 676 }, { "epoch": 0.37, "learning_rate": 9.902026661517992e-08, "logits/chosen": -1.9747562408447266, "logits/rejected": -1.9834734201431274, "logps/chosen": -2.386409282684326, "logps/rejected": -3.519967794418335, "loss": 0.4784, "rewards/accuracies": 1.0, "rewards/chosen": 1.0738598108291626, "rewards/margins": 0.48869192600250244, "rewards/rejected": 0.5851678848266602, "step": 677 }, { "epoch": 0.37, "learning_rate": 9.901596008441265e-08, "logits/chosen": -2.0712451934814453, "logits/rejected": -1.986618995666504, "logps/chosen": -11.555731773376465, "logps/rejected": -4.061558723449707, "loss": 0.6147, "rewards/accuracies": 1.0, "rewards/chosen": 1.0339511632919312, "rewards/margins": 0.16359585523605347, "rewards/rejected": 0.8703553080558777, "step": 678 }, { "epoch": 0.37, "learning_rate": 9.901164420352652e-08, "logits/chosen": -2.1550958156585693, "logits/rejected": -2.1553783416748047, "logps/chosen": -3.1963939666748047, "logps/rejected": -8.637011528015137, "loss": 0.6038, "rewards/accuracies": 1.0, "rewards/chosen": 1.0212363004684448, "rewards/margins": 0.18743109703063965, "rewards/rejected": 0.8338052034378052, "step": 679 }, { "epoch": 0.37, "learning_rate": 9.900731897334483e-08, "logits/chosen": -1.9999874830245972, "logits/rejected": -2.2281877994537354, "logps/chosen": -2.530888795852661, "logps/rejected": -2.414449453353882, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": 0.593941867351532, "rewards/margins": -0.008935213088989258, "rewards/rejected": 0.6028770804405212, "step": 680 }, { "epoch": 0.37, "learning_rate": 9.90029843946926e-08, "logits/chosen": -2.100074291229248, "logits/rejected": -2.1222472190856934, "logps/chosen": -11.748071670532227, "logps/rejected": -3.161865711212158, "loss": 0.5071, "rewards/accuracies": 1.0, "rewards/chosen": 1.1795262098312378, "rewards/margins": 0.4147101640701294, "rewards/rejected": 0.7648160457611084, "step": 681 }, { "epoch": 0.37, "learning_rate": 9.899864046839673e-08, "logits/chosen": -2.106468915939331, "logits/rejected": -2.2767491340637207, "logps/chosen": -4.785285472869873, "logps/rejected": -1.1945127248764038, "loss": 0.7175, "rewards/accuracies": 0.0, "rewards/chosen": 0.8347505927085876, "rewards/margins": -0.0480838418006897, "rewards/rejected": 0.8828344345092773, "step": 682 }, { "epoch": 0.37, "learning_rate": 9.899428719528584e-08, "logits/chosen": -2.1312780380249023, "logits/rejected": -2.1334784030914307, "logps/chosen": -0.7012277841567993, "logps/rejected": -2.566903591156006, "loss": 0.5309, "rewards/accuracies": 1.0, "rewards/chosen": 0.9752092361450195, "rewards/margins": 0.356093168258667, "rewards/rejected": 0.6191160678863525, "step": 683 }, { "epoch": 0.37, "learning_rate": 9.898992457619033e-08, "logits/chosen": -2.1063730716705322, "logits/rejected": -2.2096052169799805, "logps/chosen": -1.3314460515975952, "logps/rejected": -1.515655279159546, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.8683282732963562, "rewards/margins": 0.02173173427581787, "rewards/rejected": 0.8465965390205383, "step": 684 }, { "epoch": 0.37, "learning_rate": 9.898555261194241e-08, "logits/chosen": -2.1161882877349854, "logits/rejected": -2.3068830966949463, "logps/chosen": -2.1833419799804688, "logps/rejected": -2.1201844215393066, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.5928375124931335, "rewards/margins": 0.020960509777069092, "rewards/rejected": 0.5718770027160645, "step": 685 }, { "epoch": 0.37, "learning_rate": 9.898117130337608e-08, "logits/chosen": -2.155729055404663, "logits/rejected": -2.1570851802825928, "logps/chosen": -1.173111081123352, "logps/rejected": -2.0622360706329346, "loss": 0.5333, "rewards/accuracies": 1.0, "rewards/chosen": 1.040513277053833, "rewards/margins": 0.35029977560043335, "rewards/rejected": 0.6902135014533997, "step": 686 }, { "epoch": 0.37, "learning_rate": 9.897678065132707e-08, "logits/chosen": -2.1226627826690674, "logits/rejected": -2.1190717220306396, "logps/chosen": -12.693109512329102, "logps/rejected": -2.4599812030792236, "loss": 0.548, "rewards/accuracies": 1.0, "rewards/chosen": 1.0078881978988647, "rewards/margins": 0.3148886561393738, "rewards/rejected": 0.692999541759491, "step": 687 }, { "epoch": 0.37, "learning_rate": 9.897238065663293e-08, "logits/chosen": -2.051889657974243, "logits/rejected": -2.2793495655059814, "logps/chosen": -1.209456205368042, "logps/rejected": -1.1594386100769043, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.8133783340454102, "rewards/margins": 0.042671024799346924, "rewards/rejected": 0.7707073092460632, "step": 688 }, { "epoch": 0.37, "learning_rate": 9.896797132013301e-08, "logits/chosen": -2.0661141872406006, "logits/rejected": -2.2831480503082275, "logps/chosen": -1.2420916557312012, "logps/rejected": -1.1870782375335693, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.9055353403091431, "rewards/margins": 0.02149564027786255, "rewards/rejected": 0.8840397000312805, "step": 689 }, { "epoch": 0.37, "learning_rate": 9.896355264266841e-08, "logits/chosen": -2.0502471923828125, "logits/rejected": -2.053652286529541, "logps/chosen": -4.125946044921875, "logps/rejected": -1.1884403228759766, "loss": 0.5639, "rewards/accuracies": 1.0, "rewards/chosen": 1.181447982788086, "rewards/margins": 0.2778093218803406, "rewards/rejected": 0.9036386609077454, "step": 690 }, { "epoch": 0.37, "learning_rate": 9.895912462508202e-08, "logits/chosen": -2.133040428161621, "logits/rejected": -2.326225996017456, "logps/chosen": -1.7411248683929443, "logps/rejected": -6.0904998779296875, "loss": 0.6477, "rewards/accuracies": 1.0, "rewards/chosen": 0.8391163945198059, "rewards/margins": 0.09311932325363159, "rewards/rejected": 0.7459970712661743, "step": 691 }, { "epoch": 0.37, "learning_rate": 9.895468726821852e-08, "logits/chosen": -1.988005518913269, "logits/rejected": -1.9882299900054932, "logps/chosen": -5.5108642578125, "logps/rejected": -2.5590455532073975, "loss": 0.4608, "rewards/accuracies": 1.0, "rewards/chosen": 1.2651722431182861, "rewards/margins": 0.5354962944984436, "rewards/rejected": 0.7296759486198425, "step": 692 }, { "epoch": 0.37, "learning_rate": 9.895024057292434e-08, "logits/chosen": -2.0560052394866943, "logits/rejected": -2.240652084350586, "logps/chosen": -6.017114162445068, "logps/rejected": -1.7591595649719238, "loss": 0.6998, "rewards/accuracies": 0.0, "rewards/chosen": 0.6476576328277588, "rewards/margins": -0.013312935829162598, "rewards/rejected": 0.6609705686569214, "step": 693 }, { "epoch": 0.37, "learning_rate": 9.894578454004773e-08, "logits/chosen": -2.0235986709594727, "logits/rejected": -2.031719923019409, "logps/chosen": -4.725224494934082, "logps/rejected": -3.366943597793579, "loss": 0.5313, "rewards/accuracies": 1.0, "rewards/chosen": 0.9109919667243958, "rewards/margins": 0.3551519513130188, "rewards/rejected": 0.555840015411377, "step": 694 }, { "epoch": 0.37, "learning_rate": 9.894131917043874e-08, "logits/chosen": -2.146682024002075, "logits/rejected": -2.1428990364074707, "logps/chosen": -6.282070159912109, "logps/rejected": -4.596118450164795, "loss": 0.3652, "rewards/accuracies": 1.0, "rewards/chosen": 1.2642024755477905, "rewards/margins": 0.8190922737121582, "rewards/rejected": 0.44511017203330994, "step": 695 }, { "epoch": 0.38, "learning_rate": 9.893684446494913e-08, "logits/chosen": -2.037633180618286, "logits/rejected": -2.040255069732666, "logps/chosen": -2.249727725982666, "logps/rejected": -0.9864677786827087, "loss": 0.64, "rewards/accuracies": 1.0, "rewards/chosen": 0.8986366391181946, "rewards/margins": 0.10931670665740967, "rewards/rejected": 0.7893199324607849, "step": 696 }, { "epoch": 0.38, "learning_rate": 9.89323604244325e-08, "logits/chosen": -2.066473960876465, "logits/rejected": -2.2279231548309326, "logps/chosen": -1.3744134902954102, "logps/rejected": -1.3098965883255005, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.7002111673355103, "rewards/margins": 0.0020006299018859863, "rewards/rejected": 0.6982105374336243, "step": 697 }, { "epoch": 0.38, "learning_rate": 9.892786704974419e-08, "logits/chosen": -2.192391872406006, "logits/rejected": -2.0819637775421143, "logps/chosen": -29.27399444580078, "logps/rejected": -11.842201232910156, "loss": 0.4573, "rewards/accuracies": 1.0, "rewards/chosen": 1.0132125616073608, "rewards/margins": 0.5451398491859436, "rewards/rejected": 0.46807271242141724, "step": 698 }, { "epoch": 0.38, "learning_rate": 9.892336434174136e-08, "logits/chosen": -2.0213754177093506, "logits/rejected": -2.2550883293151855, "logps/chosen": -0.7735536694526672, "logps/rejected": -0.8134331703186035, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.9364811182022095, "rewards/margins": 0.024610161781311035, "rewards/rejected": 0.9118709564208984, "step": 699 }, { "epoch": 0.38, "learning_rate": 9.891885230128292e-08, "logits/chosen": -1.978022813796997, "logits/rejected": -2.2467854022979736, "logps/chosen": -3.533900737762451, "logps/rejected": -0.8629516363143921, "loss": 0.7309, "rewards/accuracies": 0.0, "rewards/chosen": 0.8512725830078125, "rewards/margins": -0.0740782618522644, "rewards/rejected": 0.9253508448600769, "step": 700 }, { "epoch": 0.38, "learning_rate": 9.891433092922958e-08, "logits/chosen": -1.992197871208191, "logits/rejected": -2.2292091846466064, "logps/chosen": -1.5489894151687622, "logps/rejected": -1.5393787622451782, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.804110050201416, "rewards/margins": 0.014163434505462646, "rewards/rejected": 0.7899466156959534, "step": 701 }, { "epoch": 0.38, "learning_rate": 9.890980022644382e-08, "logits/chosen": -2.1123573780059814, "logits/rejected": -2.10475492477417, "logps/chosen": -14.62765121459961, "logps/rejected": -2.124101400375366, "loss": 0.46, "rewards/accuracies": 1.0, "rewards/chosen": 1.1838997602462769, "rewards/margins": 0.5378530025482178, "rewards/rejected": 0.6460467576980591, "step": 702 }, { "epoch": 0.38, "learning_rate": 9.89052601937899e-08, "logits/chosen": -2.1737821102142334, "logits/rejected": -2.1811017990112305, "logps/chosen": -6.772977828979492, "logps/rejected": -2.3393616676330566, "loss": 0.7369, "rewards/accuracies": 0.0, "rewards/chosen": 0.5003277063369751, "rewards/margins": -0.08561927080154419, "rewards/rejected": 0.5859469771385193, "step": 703 }, { "epoch": 0.38, "learning_rate": 9.890071083213385e-08, "logits/chosen": -2.008655071258545, "logits/rejected": -1.9804202318191528, "logps/chosen": -8.987369537353516, "logps/rejected": -3.1644184589385986, "loss": 0.4492, "rewards/accuracies": 1.0, "rewards/chosen": 1.1028988361358643, "rewards/margins": 0.5671956539154053, "rewards/rejected": 0.535703182220459, "step": 704 }, { "epoch": 0.38, "learning_rate": 9.889615214234351e-08, "logits/chosen": -2.082533597946167, "logits/rejected": -2.2337043285369873, "logps/chosen": -19.576200485229492, "logps/rejected": -6.516681671142578, "loss": 1.0749, "rewards/accuracies": 0.0, "rewards/chosen": 0.13044701516628265, "rewards/margins": -0.6573024392127991, "rewards/rejected": 0.7877494692802429, "step": 705 }, { "epoch": 0.38, "learning_rate": 9.889158412528848e-08, "logits/chosen": -1.991178274154663, "logits/rejected": -2.2396280765533447, "logps/chosen": -2.9537909030914307, "logps/rejected": -2.826409101486206, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.5155116319656372, "rewards/margins": 0.01030796766281128, "rewards/rejected": 0.5052036643028259, "step": 706 }, { "epoch": 0.38, "learning_rate": 9.888700678184012e-08, "logits/chosen": -1.9541610479354858, "logits/rejected": -2.242398977279663, "logps/chosen": -1.1856036186218262, "logps/rejected": -1.1605769395828247, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.726591169834137, "rewards/margins": 0.011210858821868896, "rewards/rejected": 0.7153803110122681, "step": 707 }, { "epoch": 0.38, "learning_rate": 9.88824201128716e-08, "logits/chosen": -2.189795732498169, "logits/rejected": -2.159064531326294, "logps/chosen": -16.242156982421875, "logps/rejected": -2.7934718132019043, "loss": 0.6178, "rewards/accuracies": 1.0, "rewards/chosen": 0.9187908172607422, "rewards/margins": 0.1567479372024536, "rewards/rejected": 0.7620428800582886, "step": 708 }, { "epoch": 0.38, "learning_rate": 9.887782411925786e-08, "logits/chosen": -2.11868953704834, "logits/rejected": -2.2704110145568848, "logps/chosen": -2.988616943359375, "logps/rejected": -2.861968755722046, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.6439309120178223, "rewards/margins": 0.01910501718521118, "rewards/rejected": 0.6248258948326111, "step": 709 }, { "epoch": 0.38, "learning_rate": 9.887321880187561e-08, "logits/chosen": -2.0066678524017334, "logits/rejected": -1.9998103380203247, "logps/chosen": -3.927696704864502, "logps/rejected": -2.761570930480957, "loss": 0.5918, "rewards/accuracies": 1.0, "rewards/chosen": 0.9768253564834595, "rewards/margins": 0.214056134223938, "rewards/rejected": 0.7627692222595215, "step": 710 }, { "epoch": 0.38, "learning_rate": 9.886860416160334e-08, "logits/chosen": -2.0612740516662598, "logits/rejected": -2.0658135414123535, "logps/chosen": -1.9736063480377197, "logps/rejected": -2.6058549880981445, "loss": 0.4642, "rewards/accuracies": 1.0, "rewards/chosen": 1.0924521684646606, "rewards/margins": 0.5263599753379822, "rewards/rejected": 0.5660921931266785, "step": 711 }, { "epoch": 0.38, "learning_rate": 9.886398019932136e-08, "logits/chosen": -2.0706775188446045, "logits/rejected": -2.0727388858795166, "logps/chosen": -3.3406875133514404, "logps/rejected": -1.3572733402252197, "loss": 0.6129, "rewards/accuracies": 1.0, "rewards/chosen": 1.0675724744796753, "rewards/margins": 0.1675850749015808, "rewards/rejected": 0.8999873995780945, "step": 712 }, { "epoch": 0.38, "learning_rate": 9.885934691591166e-08, "logits/chosen": -2.0957841873168945, "logits/rejected": -2.13319993019104, "logps/chosen": -5.813270568847656, "logps/rejected": -11.935049057006836, "loss": 0.4765, "rewards/accuracies": 1.0, "rewards/chosen": 1.1028960943222046, "rewards/margins": 0.4937242269515991, "rewards/rejected": 0.6091718673706055, "step": 713 }, { "epoch": 0.39, "learning_rate": 9.885470431225813e-08, "logits/chosen": -2.0306015014648438, "logits/rejected": -2.255671501159668, "logps/chosen": -1.9713139533996582, "logps/rejected": -2.1792798042297363, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.8046558499336243, "rewards/margins": 0.006073057651519775, "rewards/rejected": 0.7985827922821045, "step": 714 }, { "epoch": 0.39, "learning_rate": 9.885005238924633e-08, "logits/chosen": -2.0013885498046875, "logits/rejected": -1.9835213422775269, "logps/chosen": -5.234731197357178, "logps/rejected": -5.612146854400635, "loss": 0.5216, "rewards/accuracies": 1.0, "rewards/chosen": 0.9647895693778992, "rewards/margins": 0.3788115978240967, "rewards/rejected": 0.5859779715538025, "step": 715 }, { "epoch": 0.39, "learning_rate": 9.884539114776367e-08, "logits/chosen": -2.0186073780059814, "logits/rejected": -2.288619041442871, "logps/chosen": -4.356498718261719, "logps/rejected": -5.967386245727539, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.8283249139785767, "rewards/margins": 0.017738640308380127, "rewards/rejected": 0.8105862736701965, "step": 716 }, { "epoch": 0.39, "learning_rate": 9.884072058869931e-08, "logits/chosen": -1.999173879623413, "logits/rejected": -2.270012855529785, "logps/chosen": -4.792370796203613, "logps/rejected": -3.3444814682006836, "loss": 0.7047, "rewards/accuracies": 0.0, "rewards/chosen": 0.5715899467468262, "rewards/margins": -0.02296745777130127, "rewards/rejected": 0.5945574045181274, "step": 717 }, { "epoch": 0.39, "learning_rate": 9.883604071294419e-08, "logits/chosen": -2.03183650970459, "logits/rejected": -2.0220954418182373, "logps/chosen": -6.30800199508667, "logps/rejected": -1.781733751296997, "loss": 0.6563, "rewards/accuracies": 1.0, "rewards/chosen": 0.7240869998931885, "rewards/margins": 0.07502102851867676, "rewards/rejected": 0.6490659713745117, "step": 718 }, { "epoch": 0.39, "learning_rate": 9.883135152139102e-08, "logits/chosen": -2.115678310394287, "logits/rejected": -2.1136391162872314, "logps/chosen": -5.517770290374756, "logps/rejected": -3.306490898132324, "loss": 0.4221, "rewards/accuracies": 1.0, "rewards/chosen": 1.1831750869750977, "rewards/margins": 0.643936812877655, "rewards/rejected": 0.5392382740974426, "step": 719 }, { "epoch": 0.39, "learning_rate": 9.88266530149343e-08, "logits/chosen": -2.0727012157440186, "logits/rejected": -2.088259696960449, "logps/chosen": -12.733600616455078, "logps/rejected": -6.556585788726807, "loss": 0.545, "rewards/accuracies": 1.0, "rewards/chosen": 1.0861704349517822, "rewards/margins": 0.32204389572143555, "rewards/rejected": 0.7641265392303467, "step": 720 }, { "epoch": 0.39, "learning_rate": 9.882194519447032e-08, "logits/chosen": -2.0367491245269775, "logits/rejected": -2.0505800247192383, "logps/chosen": -1.1717500686645508, "logps/rejected": -3.4620280265808105, "loss": 0.607, "rewards/accuracies": 1.0, "rewards/chosen": 0.9765776991844177, "rewards/margins": 0.18049657344818115, "rewards/rejected": 0.7960811257362366, "step": 721 }, { "epoch": 0.39, "learning_rate": 9.881722806089707e-08, "logits/chosen": -2.0219600200653076, "logits/rejected": -2.2994019985198975, "logps/chosen": -0.901567280292511, "logps/rejected": -0.960149884223938, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 1.0148003101348877, "rewards/margins": 0.006762266159057617, "rewards/rejected": 1.00803804397583, "step": 722 }, { "epoch": 0.39, "learning_rate": 9.881250161511444e-08, "logits/chosen": -2.0623624324798584, "logits/rejected": -2.222093343734741, "logps/chosen": -1.1290487051010132, "logps/rejected": -1.1507863998413086, "loss": 0.6564, "rewards/accuracies": 1.0, "rewards/chosen": 0.9282709360122681, "rewards/margins": 0.07487887144088745, "rewards/rejected": 0.8533920645713806, "step": 723 }, { "epoch": 0.39, "learning_rate": 9.8807765858024e-08, "logits/chosen": -2.0127015113830566, "logits/rejected": -2.2578771114349365, "logps/chosen": -5.210254192352295, "logps/rejected": -6.803745269775391, "loss": 0.5933, "rewards/accuracies": 1.0, "rewards/chosen": 0.7813400030136108, "rewards/margins": 0.21068209409713745, "rewards/rejected": 0.5706579089164734, "step": 724 }, { "epoch": 0.39, "learning_rate": 9.880302079052912e-08, "logits/chosen": -1.9781590700149536, "logits/rejected": -2.245790719985962, "logps/chosen": -2.106074094772339, "logps/rejected": -1.996651530265808, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.646436870098114, "rewards/margins": 0.007262229919433594, "rewards/rejected": 0.6391746401786804, "step": 725 }, { "epoch": 0.39, "learning_rate": 9.879826641353497e-08, "logits/chosen": -1.9605603218078613, "logits/rejected": -2.2114486694335938, "logps/chosen": -1.8500641584396362, "logps/rejected": -1.8244929313659668, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7226571440696716, "rewards/margins": 0.017519712448120117, "rewards/rejected": 0.7051374316215515, "step": 726 }, { "epoch": 0.39, "learning_rate": 9.879350272794848e-08, "logits/chosen": -1.972143530845642, "logits/rejected": -1.9974443912506104, "logps/chosen": -8.852514266967773, "logps/rejected": -27.5513858795166, "loss": 0.6079, "rewards/accuracies": 1.0, "rewards/chosen": 1.032333254814148, "rewards/margins": 0.1783851981163025, "rewards/rejected": 0.8539480566978455, "step": 727 }, { "epoch": 0.39, "learning_rate": 9.878872973467834e-08, "logits/chosen": -2.2426202297210693, "logits/rejected": -2.241452217102051, "logps/chosen": -1.6259126663208008, "logps/rejected": -1.59272301197052, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.8415623903274536, "rewards/margins": 0.012996852397918701, "rewards/rejected": 0.8285655379295349, "step": 728 }, { "epoch": 0.39, "learning_rate": 9.878394743463502e-08, "logits/chosen": -2.0273020267486572, "logits/rejected": -2.2851874828338623, "logps/chosen": -0.7682652473449707, "logps/rejected": -0.7712074518203735, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.7843877673149109, "rewards/margins": 0.028988003730773926, "rewards/rejected": 0.755399763584137, "step": 729 }, { "epoch": 0.39, "learning_rate": 9.877915582873082e-08, "logits/chosen": -2.0480470657348633, "logits/rejected": -2.086343765258789, "logps/chosen": -6.381052494049072, "logps/rejected": -10.793054580688477, "loss": 0.4178, "rewards/accuracies": 1.0, "rewards/chosen": 1.2017830610275269, "rewards/margins": 0.65645432472229, "rewards/rejected": 0.5453287363052368, "step": 730 }, { "epoch": 0.39, "learning_rate": 9.877435491787973e-08, "logits/chosen": -2.129805326461792, "logits/rejected": -2.1258318424224854, "logps/chosen": -5.461798667907715, "logps/rejected": -5.177604675292969, "loss": 0.4143, "rewards/accuracies": 1.0, "rewards/chosen": 1.0651860237121582, "rewards/margins": 0.6668925285339355, "rewards/rejected": 0.39829349517822266, "step": 731 }, { "epoch": 0.39, "learning_rate": 9.876954470299759e-08, "logits/chosen": -2.0483756065368652, "logits/rejected": -2.239424705505371, "logps/chosen": -1.0526986122131348, "logps/rejected": -1.0115410089492798, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.7383097410202026, "rewards/margins": -0.0031018853187561035, "rewards/rejected": 0.7414116263389587, "step": 732 }, { "epoch": 0.4, "learning_rate": 9.876472518500193e-08, "logits/chosen": -2.1144447326660156, "logits/rejected": -2.254579782485962, "logps/chosen": -2.0173146724700928, "logps/rejected": -1.8839019536972046, "loss": 0.6719, "rewards/accuracies": 1.0, "rewards/chosen": 0.9851590991020203, "rewards/margins": 0.04303872585296631, "rewards/rejected": 0.942120373249054, "step": 733 }, { "epoch": 0.4, "learning_rate": 9.875989636481214e-08, "logits/chosen": -2.086167097091675, "logits/rejected": -2.237248659133911, "logps/chosen": -0.5787270069122314, "logps/rejected": -0.6749298572540283, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.7762954831123352, "rewards/margins": 0.022987842559814453, "rewards/rejected": 0.7533076405525208, "step": 734 }, { "epoch": 0.4, "learning_rate": 9.875505824334935e-08, "logits/chosen": -2.017876386642456, "logits/rejected": -2.013187885284424, "logps/chosen": -1.585234522819519, "logps/rejected": -4.090098857879639, "loss": 0.5314, "rewards/accuracies": 1.0, "rewards/chosen": 0.9787728190422058, "rewards/margins": 0.3547983169555664, "rewards/rejected": 0.6239745020866394, "step": 735 }, { "epoch": 0.4, "learning_rate": 9.875021082153646e-08, "logits/chosen": -2.014183282852173, "logits/rejected": -2.0110552310943604, "logps/chosen": -1.6607959270477295, "logps/rejected": -2.22550630569458, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8275200724601746, "rewards/margins": 0.008654475212097168, "rewards/rejected": 0.8188655972480774, "step": 736 }, { "epoch": 0.4, "learning_rate": 9.874535410029814e-08, "logits/chosen": -2.0455923080444336, "logits/rejected": -2.21431040763855, "logps/chosen": -5.015509605407715, "logps/rejected": -8.332548141479492, "loss": 0.4535, "rewards/accuracies": 1.0, "rewards/chosen": 0.8300153017044067, "rewards/margins": 0.5554280281066895, "rewards/rejected": 0.2745872437953949, "step": 737 }, { "epoch": 0.4, "learning_rate": 9.874048808056085e-08, "logits/chosen": -2.1106057167053223, "logits/rejected": -2.2659661769866943, "logps/chosen": -1.5750830173492432, "logps/rejected": -1.5735893249511719, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 0.9355757832527161, "rewards/margins": -0.009820640087127686, "rewards/rejected": 0.9453964233398438, "step": 738 }, { "epoch": 0.4, "learning_rate": 9.873561276325281e-08, "logits/chosen": -2.112581491470337, "logits/rejected": -2.310241222381592, "logps/chosen": -1.0137860774993896, "logps/rejected": -1.0033314228057861, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9220307469367981, "rewards/margins": 0.005573272705078125, "rewards/rejected": 0.91645747423172, "step": 739 }, { "epoch": 0.4, "learning_rate": 9.873072814930401e-08, "logits/chosen": -2.035567283630371, "logits/rejected": -2.230358123779297, "logps/chosen": -1.1247791051864624, "logps/rejected": -2.0310847759246826, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.8792605400085449, "rewards/margins": 0.03866797685623169, "rewards/rejected": 0.8405925631523132, "step": 740 }, { "epoch": 0.4, "learning_rate": 9.872583423964624e-08, "logits/chosen": -2.078965425491333, "logits/rejected": -2.3398537635803223, "logps/chosen": -8.737608909606934, "logps/rejected": -8.569515228271484, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 0.749086856842041, "rewards/margins": -0.016331613063812256, "rewards/rejected": 0.7654184699058533, "step": 741 }, { "epoch": 0.4, "learning_rate": 9.872093103521305e-08, "logits/chosen": -2.0928292274475098, "logits/rejected": -2.0931169986724854, "logps/chosen": -2.042696237564087, "logps/rejected": -1.277627944946289, "loss": 0.6505, "rewards/accuracies": 1.0, "rewards/chosen": 0.9178061485290527, "rewards/margins": 0.08723306655883789, "rewards/rejected": 0.8305730819702148, "step": 742 }, { "epoch": 0.4, "learning_rate": 9.871601853693974e-08, "logits/chosen": -2.0489964485168457, "logits/rejected": -2.2702198028564453, "logps/chosen": -1.0128157138824463, "logps/rejected": -0.9239560961723328, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.7468553781509399, "rewards/margins": 0.0024529695510864258, "rewards/rejected": 0.7444024085998535, "step": 743 }, { "epoch": 0.4, "learning_rate": 9.871109674576342e-08, "logits/chosen": -1.9731223583221436, "logits/rejected": -1.9303196668624878, "logps/chosen": -15.017176628112793, "logps/rejected": -2.3526649475097656, "loss": 0.6362, "rewards/accuracies": 1.0, "rewards/chosen": 0.9951003193855286, "rewards/margins": 0.11727839708328247, "rewards/rejected": 0.8778219223022461, "step": 744 }, { "epoch": 0.4, "learning_rate": 9.870616566262293e-08, "logits/chosen": -2.0086829662323, "logits/rejected": -2.0117995738983154, "logps/chosen": -5.697805881500244, "logps/rejected": -2.9011073112487793, "loss": 0.4032, "rewards/accuracies": 1.0, "rewards/chosen": 1.3410780429840088, "rewards/margins": 0.699847400188446, "rewards/rejected": 0.6412306427955627, "step": 745 }, { "epoch": 0.4, "learning_rate": 9.870122528845892e-08, "logits/chosen": -2.2116191387176514, "logits/rejected": -2.2977519035339355, "logps/chosen": -2.9647276401519775, "logps/rejected": -3.109344244003296, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.7512820363044739, "rewards/margins": 0.03249561786651611, "rewards/rejected": 0.7187864184379578, "step": 746 }, { "epoch": 0.4, "learning_rate": 9.86962756242138e-08, "logits/chosen": -2.092139720916748, "logits/rejected": -2.094542980194092, "logps/chosen": -4.638384819030762, "logps/rejected": -3.981879949569702, "loss": 0.6122, "rewards/accuracies": 1.0, "rewards/chosen": 0.9556933641433716, "rewards/margins": 0.16898423433303833, "rewards/rejected": 0.7867091298103333, "step": 747 }, { "epoch": 0.4, "learning_rate": 9.869131667083175e-08, "logits/chosen": -2.048729419708252, "logits/rejected": -2.3188955783843994, "logps/chosen": -0.850159764289856, "logps/rejected": -0.8233846426010132, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.9429637789726257, "rewards/margins": 0.0069771409034729, "rewards/rejected": 0.9359866380691528, "step": 748 }, { "epoch": 0.4, "learning_rate": 9.868634842925874e-08, "logits/chosen": -2.0441360473632812, "logits/rejected": -2.2780604362487793, "logps/chosen": -1.6972949504852295, "logps/rejected": -1.7514145374298096, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9831297993659973, "rewards/margins": 0.007721900939941406, "rewards/rejected": 0.9754078984260559, "step": 749 }, { "epoch": 0.4, "learning_rate": 9.868137090044246e-08, "logits/chosen": -2.142242670059204, "logits/rejected": -2.1387226581573486, "logps/chosen": -5.168557643890381, "logps/rejected": -2.6304659843444824, "loss": 0.436, "rewards/accuracies": 1.0, "rewards/chosen": 1.3781206607818604, "rewards/margins": 0.6043248176574707, "rewards/rejected": 0.7737958431243896, "step": 750 }, { "epoch": 0.41, "learning_rate": 9.867638408533243e-08, "logits/chosen": -2.010641098022461, "logits/rejected": -2.3365705013275146, "logps/chosen": -1.72260582447052, "logps/rejected": -1.7085850238800049, "loss": 0.7026, "rewards/accuracies": 0.0, "rewards/chosen": 0.9265483021736145, "rewards/margins": -0.0188787579536438, "rewards/rejected": 0.9454270601272583, "step": 751 }, { "epoch": 0.41, "learning_rate": 9.867138798487992e-08, "logits/chosen": -1.9619793891906738, "logits/rejected": -2.222043037414551, "logps/chosen": -0.8471782207489014, "logps/rejected": -0.8450640439987183, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.8370976448059082, "rewards/margins": 0.019464313983917236, "rewards/rejected": 0.817633330821991, "step": 752 }, { "epoch": 0.41, "learning_rate": 9.866638260003796e-08, "logits/chosen": -2.0143978595733643, "logits/rejected": -2.2380330562591553, "logps/chosen": -1.3349905014038086, "logps/rejected": -1.2295788526535034, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.9492158889770508, "rewards/margins": 0.00429457426071167, "rewards/rejected": 0.9449213147163391, "step": 753 }, { "epoch": 0.41, "learning_rate": 9.866136793176137e-08, "logits/chosen": -2.000960350036621, "logits/rejected": -2.271038293838501, "logps/chosen": -0.9720600843429565, "logps/rejected": -1.039758563041687, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.9644937515258789, "rewards/margins": 0.009421348571777344, "rewards/rejected": 0.9550724029541016, "step": 754 }, { "epoch": 0.41, "learning_rate": 9.865634398100671e-08, "logits/chosen": -2.1150131225585938, "logits/rejected": -2.1161301136016846, "logps/chosen": -2.4839038848876953, "logps/rejected": -1.1758954524993896, "loss": 0.607, "rewards/accuracies": 1.0, "rewards/chosen": 1.032069206237793, "rewards/margins": 0.18036574125289917, "rewards/rejected": 0.8517034649848938, "step": 755 }, { "epoch": 0.41, "learning_rate": 9.865131074873234e-08, "logits/chosen": -2.192129611968994, "logits/rejected": -2.1942126750946045, "logps/chosen": -2.5613670349121094, "logps/rejected": -1.4385979175567627, "loss": 0.5844, "rewards/accuracies": 1.0, "rewards/chosen": 0.9560793042182922, "rewards/margins": 0.23076331615447998, "rewards/rejected": 0.7253159880638123, "step": 756 }, { "epoch": 0.41, "learning_rate": 9.864626823589841e-08, "logits/chosen": -2.1340394020080566, "logits/rejected": -2.124983072280884, "logps/chosen": -6.629554748535156, "logps/rejected": -3.0615103244781494, "loss": 0.5927, "rewards/accuracies": 1.0, "rewards/chosen": 0.8062207102775574, "rewards/margins": 0.2120344042778015, "rewards/rejected": 0.5941863059997559, "step": 757 }, { "epoch": 0.41, "learning_rate": 9.864121644346678e-08, "logits/chosen": -2.025568962097168, "logits/rejected": -2.2351465225219727, "logps/chosen": -1.4818415641784668, "logps/rejected": -1.7663379907608032, "loss": 0.6739, "rewards/accuracies": 1.0, "rewards/chosen": 0.7817335724830627, "rewards/margins": 0.03880053758621216, "rewards/rejected": 0.7429330348968506, "step": 758 }, { "epoch": 0.41, "learning_rate": 9.863615537240114e-08, "logits/chosen": -1.9899872541427612, "logits/rejected": -2.239748239517212, "logps/chosen": -2.216392993927002, "logps/rejected": -2.283954381942749, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.7364606261253357, "rewards/margins": 0.004542112350463867, "rewards/rejected": 0.7319185137748718, "step": 759 }, { "epoch": 0.41, "learning_rate": 9.863108502366688e-08, "logits/chosen": -2.1267731189727783, "logits/rejected": -2.1134159564971924, "logps/chosen": -11.545917510986328, "logps/rejected": -2.893320083618164, "loss": 0.4676, "rewards/accuracies": 1.0, "rewards/chosen": 1.189896821975708, "rewards/margins": 0.5172052979469299, "rewards/rejected": 0.6726915240287781, "step": 760 }, { "epoch": 0.41, "learning_rate": 9.862600539823123e-08, "logits/chosen": -2.1805684566497803, "logits/rejected": -2.1885440349578857, "logps/chosen": -3.327171564102173, "logps/rejected": -3.0218679904937744, "loss": 0.4335, "rewards/accuracies": 1.0, "rewards/chosen": 1.1866520643234253, "rewards/margins": 0.6113585829734802, "rewards/rejected": 0.5752934813499451, "step": 761 }, { "epoch": 0.41, "learning_rate": 9.862091649706319e-08, "logits/chosen": -2.1050314903259277, "logits/rejected": -2.2300968170166016, "logps/chosen": -9.803681373596191, "logps/rejected": -1.467320203781128, "loss": 0.7361, "rewards/accuracies": 0.0, "rewards/chosen": 0.687462329864502, "rewards/margins": -0.08412474393844604, "rewards/rejected": 0.771587073802948, "step": 762 }, { "epoch": 0.41, "learning_rate": 9.861581832113345e-08, "logits/chosen": -2.049795150756836, "logits/rejected": -2.3016738891601562, "logps/chosen": -3.6299591064453125, "logps/rejected": -3.844174385070801, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.7148141860961914, "rewards/margins": -0.001735687255859375, "rewards/rejected": 0.7165498733520508, "step": 763 }, { "epoch": 0.41, "learning_rate": 9.861071087141455e-08, "logits/chosen": -2.084618330001831, "logits/rejected": -2.083230972290039, "logps/chosen": -5.324738502502441, "logps/rejected": -4.100966930389404, "loss": 0.4335, "rewards/accuracies": 1.0, "rewards/chosen": 1.1853381395339966, "rewards/margins": 0.6114013195037842, "rewards/rejected": 0.5739368200302124, "step": 764 }, { "epoch": 0.41, "learning_rate": 9.860559414888077e-08, "logits/chosen": -2.034224271774292, "logits/rejected": -2.2842564582824707, "logps/chosen": -0.9499203562736511, "logps/rejected": -0.95167475938797, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.8766518831253052, "rewards/margins": 0.035144925117492676, "rewards/rejected": 0.8415069580078125, "step": 765 }, { "epoch": 0.41, "learning_rate": 9.860046815450813e-08, "logits/chosen": -2.0067999362945557, "logits/rejected": -2.0068700313568115, "logps/chosen": -2.7631256580352783, "logps/rejected": -4.656089782714844, "loss": 0.5175, "rewards/accuracies": 1.0, "rewards/chosen": 0.89429771900177, "rewards/margins": 0.3889264464378357, "rewards/rejected": 0.5053712725639343, "step": 766 }, { "epoch": 0.41, "learning_rate": 9.859533288927448e-08, "logits/chosen": -2.0983822345733643, "logits/rejected": -2.291459083557129, "logps/chosen": -2.7565441131591797, "logps/rejected": -6.827107906341553, "loss": 0.6564, "rewards/accuracies": 1.0, "rewards/chosen": 0.9545386433601379, "rewards/margins": 0.0749901533126831, "rewards/rejected": 0.8795484900474548, "step": 767 }, { "epoch": 0.41, "learning_rate": 9.859018835415942e-08, "logits/chosen": -2.140679359436035, "logits/rejected": -2.1374757289886475, "logps/chosen": -6.449466228485107, "logps/rejected": -2.8457789421081543, "loss": 0.3497, "rewards/accuracies": 1.0, "rewards/chosen": 1.4540224075317383, "rewards/margins": 0.870840847492218, "rewards/rejected": 0.5831815600395203, "step": 768 }, { "epoch": 0.41, "learning_rate": 9.858503455014427e-08, "logits/chosen": -1.9311132431030273, "logits/rejected": -2.279578447341919, "logps/chosen": -10.297505378723145, "logps/rejected": -10.40576457977295, "loss": 0.664, "rewards/accuracies": 1.0, "rewards/chosen": 0.7976216673851013, "rewards/margins": 0.05912822484970093, "rewards/rejected": 0.7384934425354004, "step": 769 }, { "epoch": 0.42, "learning_rate": 9.857987147821214e-08, "logits/chosen": -2.089276075363159, "logits/rejected": -2.2540435791015625, "logps/chosen": -2.075946569442749, "logps/rejected": -10.815690040588379, "loss": 0.6278, "rewards/accuracies": 1.0, "rewards/chosen": 0.9572002291679382, "rewards/margins": 0.13517117500305176, "rewards/rejected": 0.8220290541648865, "step": 770 }, { "epoch": 0.42, "learning_rate": 9.857469913934795e-08, "logits/chosen": -2.0863051414489746, "logits/rejected": -2.086134672164917, "logps/chosen": -3.0417966842651367, "logps/rejected": -1.7542954683303833, "loss": 0.6361, "rewards/accuracies": 1.0, "rewards/chosen": 0.9479327201843262, "rewards/margins": 0.11750555038452148, "rewards/rejected": 0.8304271697998047, "step": 771 }, { "epoch": 0.42, "learning_rate": 9.856951753453836e-08, "logits/chosen": -2.0610716342926025, "logits/rejected": -2.2038707733154297, "logps/chosen": -8.274188041687012, "logps/rejected": -1.3873330354690552, "loss": 0.7143, "rewards/accuracies": 0.0, "rewards/chosen": 0.7476800084114075, "rewards/margins": -0.04186362028121948, "rewards/rejected": 0.789543628692627, "step": 772 }, { "epoch": 0.42, "learning_rate": 9.856432666477179e-08, "logits/chosen": -2.099905014038086, "logits/rejected": -2.102484703063965, "logps/chosen": -1.4328525066375732, "logps/rejected": -8.687981605529785, "loss": 0.4243, "rewards/accuracies": 1.0, "rewards/chosen": 1.1083554029464722, "rewards/margins": 0.6375637054443359, "rewards/rejected": 0.4707917273044586, "step": 773 }, { "epoch": 0.42, "learning_rate": 9.855912653103842e-08, "logits/chosen": -2.073225498199463, "logits/rejected": -2.248345136642456, "logps/chosen": -0.9730974435806274, "logps/rejected": -0.9784987568855286, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.9224919676780701, "rewards/margins": 0.0058768391609191895, "rewards/rejected": 0.9166151285171509, "step": 774 }, { "epoch": 0.42, "learning_rate": 9.855391713433021e-08, "logits/chosen": -1.9898970127105713, "logits/rejected": -2.215919017791748, "logps/chosen": -1.7252435684204102, "logps/rejected": -1.7233788967132568, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.6636360287666321, "rewards/margins": 0.02943629026412964, "rewards/rejected": 0.6341997385025024, "step": 775 }, { "epoch": 0.42, "learning_rate": 9.85486984756409e-08, "logits/chosen": -1.958500862121582, "logits/rejected": -2.2391440868377686, "logps/chosen": -1.8642668724060059, "logps/rejected": -1.9546846151351929, "loss": 0.6702, "rewards/accuracies": 1.0, "rewards/chosen": 0.9328165054321289, "rewards/margins": 0.04640758037567139, "rewards/rejected": 0.8864089250564575, "step": 776 }, { "epoch": 0.42, "learning_rate": 9.854347055596599e-08, "logits/chosen": -2.0406041145324707, "logits/rejected": -2.032135486602783, "logps/chosen": -8.74119758605957, "logps/rejected": -2.3274857997894287, "loss": 0.5237, "rewards/accuracies": 1.0, "rewards/chosen": 1.279152512550354, "rewards/margins": 0.37347203493118286, "rewards/rejected": 0.9056804776191711, "step": 777 }, { "epoch": 0.42, "learning_rate": 9.853823337630272e-08, "logits/chosen": -2.0774142742156982, "logits/rejected": -2.0868172645568848, "logps/chosen": -5.543358325958252, "logps/rejected": -8.048559188842773, "loss": 0.5104, "rewards/accuracies": 1.0, "rewards/chosen": 1.0797840356826782, "rewards/margins": 0.40653079748153687, "rewards/rejected": 0.6732532382011414, "step": 778 }, { "epoch": 0.42, "learning_rate": 9.853298693765012e-08, "logits/chosen": -2.0240869522094727, "logits/rejected": -2.333273410797119, "logps/chosen": -5.243913173675537, "logps/rejected": -7.658150672912598, "loss": 0.6482, "rewards/accuracies": 1.0, "rewards/chosen": 1.0841476917266846, "rewards/margins": 0.09195572137832642, "rewards/rejected": 0.9921919703483582, "step": 779 }, { "epoch": 0.42, "learning_rate": 9.8527731241009e-08, "logits/chosen": -2.01470947265625, "logits/rejected": -2.0217602252960205, "logps/chosen": -2.4452879428863525, "logps/rejected": -2.2556216716766357, "loss": 0.5581, "rewards/accuracies": 1.0, "rewards/chosen": 0.9027120471000671, "rewards/margins": 0.29119324684143066, "rewards/rejected": 0.6115188002586365, "step": 780 }, { "epoch": 0.42, "learning_rate": 9.85224662873819e-08, "logits/chosen": -2.1154966354370117, "logits/rejected": -2.111685037612915, "logps/chosen": -4.474676132202148, "logps/rejected": -2.261423349380493, "loss": 0.5016, "rewards/accuracies": 1.0, "rewards/chosen": 1.1458735466003418, "rewards/margins": 0.42869043350219727, "rewards/rejected": 0.7171831130981445, "step": 781 }, { "epoch": 0.42, "learning_rate": 9.851719207777317e-08, "logits/chosen": -2.021437406539917, "logits/rejected": -2.251887559890747, "logps/chosen": -3.0997159481048584, "logps/rejected": -3.0621132850646973, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.5893881916999817, "rewards/margins": 0.04207348823547363, "rewards/rejected": 0.5473147034645081, "step": 782 }, { "epoch": 0.42, "learning_rate": 9.851190861318886e-08, "logits/chosen": -2.0121607780456543, "logits/rejected": -2.244248390197754, "logps/chosen": -5.579586029052734, "logps/rejected": -1.6303614377975464, "loss": 0.7243, "rewards/accuracies": 0.0, "rewards/chosen": 0.8040220141410828, "rewards/margins": -0.0614314079284668, "rewards/rejected": 0.8654534220695496, "step": 783 }, { "epoch": 0.42, "learning_rate": 9.850661589463685e-08, "logits/chosen": -2.0559568405151367, "logits/rejected": -2.2655370235443115, "logps/chosen": -0.9093877077102661, "logps/rejected": -0.9138621687889099, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.8179929852485657, "rewards/margins": 0.0025649070739746094, "rewards/rejected": 0.8154280781745911, "step": 784 }, { "epoch": 0.42, "learning_rate": 9.850131392312676e-08, "logits/chosen": -1.9481121301651, "logits/rejected": -2.2410683631896973, "logps/chosen": -3.6126933097839355, "logps/rejected": -3.611720085144043, "loss": 0.7108, "rewards/accuracies": 0.0, "rewards/chosen": 0.6713010668754578, "rewards/margins": -0.03506654500961304, "rewards/rejected": 0.7063676118850708, "step": 785 }, { "epoch": 0.42, "learning_rate": 9.849600269966997e-08, "logits/chosen": -2.141364336013794, "logits/rejected": -2.141462802886963, "logps/chosen": -1.2895927429199219, "logps/rejected": -1.7985575199127197, "loss": 0.7026, "rewards/accuracies": 0.0, "rewards/chosen": 0.8561536073684692, "rewards/margins": -0.018817484378814697, "rewards/rejected": 0.8749710917472839, "step": 786 }, { "epoch": 0.42, "learning_rate": 9.849068222527965e-08, "logits/chosen": -2.021202325820923, "logits/rejected": -2.230560302734375, "logps/chosen": -1.8505955934524536, "logps/rejected": -1.891943097114563, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.915769100189209, "rewards/margins": 0.014128386974334717, "rewards/rejected": 0.9016407132148743, "step": 787 }, { "epoch": 0.43, "learning_rate": 9.84853525009707e-08, "logits/chosen": -2.0510849952697754, "logits/rejected": -2.2609612941741943, "logps/chosen": -1.1471030712127686, "logps/rejected": -1.0266494750976562, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.7567357420921326, "rewards/margins": 0.014539957046508789, "rewards/rejected": 0.7421957850456238, "step": 788 }, { "epoch": 0.43, "learning_rate": 9.848001352775981e-08, "logits/chosen": -2.078047513961792, "logits/rejected": -2.2767369747161865, "logps/chosen": -1.4176620244979858, "logps/rejected": -1.47873854637146, "loss": 0.6745, "rewards/accuracies": 1.0, "rewards/chosen": 0.8453659415245056, "rewards/margins": 0.03763240575790405, "rewards/rejected": 0.8077335357666016, "step": 789 }, { "epoch": 0.43, "learning_rate": 9.847466530666542e-08, "logits/chosen": -2.078826665878296, "logits/rejected": -2.0884244441986084, "logps/chosen": -1.815751552581787, "logps/rejected": -9.077238082885742, "loss": 0.6429, "rewards/accuracies": 1.0, "rewards/chosen": 0.8605030179023743, "rewards/margins": 0.10315513610839844, "rewards/rejected": 0.7573478817939758, "step": 790 }, { "epoch": 0.43, "learning_rate": 9.846930783870773e-08, "logits/chosen": -2.114062786102295, "logits/rejected": -2.26613712310791, "logps/chosen": -1.337026834487915, "logps/rejected": -1.376441478729248, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0293926000595093, "rewards/margins": 0.018512248992919922, "rewards/rejected": 1.0108803510665894, "step": 791 }, { "epoch": 0.43, "learning_rate": 9.84639411249087e-08, "logits/chosen": -2.0123159885406494, "logits/rejected": -2.2235279083251953, "logps/chosen": -2.0631484985351562, "logps/rejected": -3.932295799255371, "loss": 0.6574, "rewards/accuracies": 1.0, "rewards/chosen": 0.8929451107978821, "rewards/margins": 0.07277184724807739, "rewards/rejected": 0.8201732635498047, "step": 792 }, { "epoch": 0.43, "learning_rate": 9.845856516629211e-08, "logits/chosen": -2.206740379333496, "logits/rejected": -2.0350561141967773, "logps/chosen": -57.22919464111328, "logps/rejected": -1.5285208225250244, "loss": 0.4265, "rewards/accuracies": 1.0, "rewards/chosen": 1.3816803693771362, "rewards/margins": 0.6311917901039124, "rewards/rejected": 0.7504885792732239, "step": 793 }, { "epoch": 0.43, "learning_rate": 9.845317996388342e-08, "logits/chosen": -2.0410144329071045, "logits/rejected": -2.2252275943756104, "logps/chosen": -0.6220470070838928, "logps/rejected": -0.5768219232559204, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.6855636239051819, "rewards/margins": 0.01240992546081543, "rewards/rejected": 0.6731536984443665, "step": 794 }, { "epoch": 0.43, "learning_rate": 9.844778551870991e-08, "logits/chosen": -2.0027060508728027, "logits/rejected": -2.2652525901794434, "logps/chosen": -1.2076642513275146, "logps/rejected": -1.2928979396820068, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.7823311686515808, "rewards/margins": 0.015803992748260498, "rewards/rejected": 0.7665271759033203, "step": 795 }, { "epoch": 0.43, "learning_rate": 9.84423818318006e-08, "logits/chosen": -2.0170047283172607, "logits/rejected": -2.019925117492676, "logps/chosen": -0.6375331282615662, "logps/rejected": -3.4056904315948486, "loss": 0.6182, "rewards/accuracies": 1.0, "rewards/chosen": 0.7206560969352722, "rewards/margins": 0.155958890914917, "rewards/rejected": 0.5646972060203552, "step": 796 }, { "epoch": 0.43, "learning_rate": 9.84369689041863e-08, "logits/chosen": -1.9602829217910767, "logits/rejected": -2.2409467697143555, "logps/chosen": -0.555510401725769, "logps/rejected": -0.5850358009338379, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8057398796081543, "rewards/margins": 0.009763479232788086, "rewards/rejected": 0.7959764003753662, "step": 797 }, { "epoch": 0.43, "learning_rate": 9.843154673689952e-08, "logits/chosen": -1.9713481664657593, "logits/rejected": -2.2400288581848145, "logps/chosen": -1.211277961730957, "logps/rejected": -1.1411449909210205, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9857813119888306, "rewards/margins": 0.019191205501556396, "rewards/rejected": 0.9665901064872742, "step": 798 }, { "epoch": 0.43, "learning_rate": 9.842611533097461e-08, "logits/chosen": -1.9932817220687866, "logits/rejected": -2.2732133865356445, "logps/chosen": -0.6535390615463257, "logps/rejected": -0.6327134370803833, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.956353485584259, "rewards/margins": 0.02001732587814331, "rewards/rejected": 0.9363361597061157, "step": 799 }, { "epoch": 0.43, "learning_rate": 9.842067468744764e-08, "logits/chosen": -2.0295310020446777, "logits/rejected": -2.0251572132110596, "logps/chosen": -7.049286842346191, "logps/rejected": -2.627300500869751, "loss": 0.5325, "rewards/accuracies": 1.0, "rewards/chosen": 1.0092425346374512, "rewards/margins": 0.352170467376709, "rewards/rejected": 0.6570720672607422, "step": 800 }, { "epoch": 0.43, "learning_rate": 9.841522480735643e-08, "logits/chosen": -2.0574212074279785, "logits/rejected": -2.0146379470825195, "logps/chosen": -35.2972412109375, "logps/rejected": -2.123530626296997, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.8589469790458679, "rewards/margins": 0.005670428276062012, "rewards/rejected": 0.8532765507698059, "step": 801 }, { "epoch": 0.43, "learning_rate": 9.84097656917406e-08, "logits/chosen": -2.117420196533203, "logits/rejected": -2.128175973892212, "logps/chosen": -5.883669853210449, "logps/rejected": -2.86122989654541, "loss": 0.4785, "rewards/accuracies": 1.0, "rewards/chosen": 1.166221022605896, "rewards/margins": 0.48831743001937866, "rewards/rejected": 0.6779035925865173, "step": 802 }, { "epoch": 0.43, "learning_rate": 9.840429734164152e-08, "logits/chosen": -2.140357494354248, "logits/rejected": -2.1638503074645996, "logps/chosen": -1.6432783603668213, "logps/rejected": -8.69632339477539, "loss": 0.4763, "rewards/accuracies": 1.0, "rewards/chosen": 1.0203646421432495, "rewards/margins": 0.49418437480926514, "rewards/rejected": 0.5261802673339844, "step": 803 }, { "epoch": 0.43, "learning_rate": 9.83988197581023e-08, "logits/chosen": -2.144094705581665, "logits/rejected": -2.322767972946167, "logps/chosen": -2.003070116043091, "logps/rejected": -1.7890756130218506, "loss": 0.6982, "rewards/accuracies": 0.0, "rewards/chosen": 0.612970769405365, "rewards/margins": -0.010087072849273682, "rewards/rejected": 0.6230578422546387, "step": 804 }, { "epoch": 0.43, "learning_rate": 9.839333294216781e-08, "logits/chosen": -2.0406126976013184, "logits/rejected": -2.0301780700683594, "logps/chosen": -27.518997192382812, "logps/rejected": -0.6635238528251648, "loss": 0.4779, "rewards/accuracies": 1.0, "rewards/chosen": 1.378474473953247, "rewards/margins": 0.48997050523757935, "rewards/rejected": 0.8885039687156677, "step": 805 }, { "epoch": 0.43, "learning_rate": 9.838783689488472e-08, "logits/chosen": -2.0691120624542236, "logits/rejected": -1.963802456855774, "logps/chosen": -33.07484436035156, "logps/rejected": -5.128142356872559, "loss": 0.7523, "rewards/accuracies": 0.0, "rewards/chosen": 0.7923027276992798, "rewards/margins": -0.11505049467086792, "rewards/rejected": 0.9073532223701477, "step": 806 }, { "epoch": 0.44, "learning_rate": 9.838233161730142e-08, "logits/chosen": -2.1204934120178223, "logits/rejected": -2.045386791229248, "logps/chosen": -30.396373748779297, "logps/rejected": -6.354720115661621, "loss": 0.6056, "rewards/accuracies": 1.0, "rewards/chosen": 0.8258510828018188, "rewards/margins": 0.18344300985336304, "rewards/rejected": 0.6424080729484558, "step": 807 }, { "epoch": 0.44, "learning_rate": 9.83768171104681e-08, "logits/chosen": -2.033780574798584, "logits/rejected": -2.0269598960876465, "logps/chosen": -7.952359676361084, "logps/rejected": -2.288372039794922, "loss": 0.4273, "rewards/accuracies": 1.0, "rewards/chosen": 1.2702970504760742, "rewards/margins": 0.6290971636772156, "rewards/rejected": 0.6411998867988586, "step": 808 }, { "epoch": 0.44, "learning_rate": 9.837129337543667e-08, "logits/chosen": -2.0821168422698975, "logits/rejected": -2.3022055625915527, "logps/chosen": -2.2739310264587402, "logps/rejected": -1.7461016178131104, "loss": 0.6628, "rewards/accuracies": 1.0, "rewards/chosen": 1.1195582151412964, "rewards/margins": 0.06174170970916748, "rewards/rejected": 1.057816505432129, "step": 809 }, { "epoch": 0.44, "learning_rate": 9.836576041326084e-08, "logits/chosen": -2.0981802940368652, "logits/rejected": -2.085378408432007, "logps/chosen": -7.595093250274658, "logps/rejected": -3.8820793628692627, "loss": 0.4856, "rewards/accuracies": 1.0, "rewards/chosen": 1.3103876113891602, "rewards/margins": 0.46972042322158813, "rewards/rejected": 0.840667188167572, "step": 810 }, { "epoch": 0.44, "learning_rate": 9.836021822499602e-08, "logits/chosen": -1.974062442779541, "logits/rejected": -2.2571804523468018, "logps/chosen": -2.678084373474121, "logps/rejected": -1.0280210971832275, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.7458658218383789, "rewards/margins": -0.012363612651824951, "rewards/rejected": 0.7582294344902039, "step": 811 }, { "epoch": 0.44, "learning_rate": 9.835466681169947e-08, "logits/chosen": -2.114071846008301, "logits/rejected": -2.109363555908203, "logps/chosen": -2.408722400665283, "logps/rejected": -6.445779800415039, "loss": 0.496, "rewards/accuracies": 1.0, "rewards/chosen": 1.0254133939743042, "rewards/margins": 0.44293874502182007, "rewards/rejected": 0.5824746489524841, "step": 812 }, { "epoch": 0.44, "learning_rate": 9.834910617443013e-08, "logits/chosen": -2.1218338012695312, "logits/rejected": -2.1153147220611572, "logps/chosen": -7.286620616912842, "logps/rejected": -3.1174979209899902, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 1.086897850036621, "rewards/margins": 0.4242587685585022, "rewards/rejected": 0.6626390814781189, "step": 813 }, { "epoch": 0.44, "learning_rate": 9.834353631424872e-08, "logits/chosen": -2.125758171081543, "logits/rejected": -2.182018280029297, "logps/chosen": -10.113966941833496, "logps/rejected": -21.615882873535156, "loss": 0.4855, "rewards/accuracies": 1.0, "rewards/chosen": 1.1921223402023315, "rewards/margins": 0.47007185220718384, "rewards/rejected": 0.7220504879951477, "step": 814 }, { "epoch": 0.44, "learning_rate": 9.833795723221774e-08, "logits/chosen": -2.041750907897949, "logits/rejected": -2.0484719276428223, "logps/chosen": -2.1889946460723877, "logps/rejected": -13.047432899475098, "loss": 0.569, "rewards/accuracies": 1.0, "rewards/chosen": 1.0082414150238037, "rewards/margins": 0.26590198278427124, "rewards/rejected": 0.7423394322395325, "step": 815 }, { "epoch": 0.44, "learning_rate": 9.833236892940144e-08, "logits/chosen": -2.0774941444396973, "logits/rejected": -1.9719318151474, "logps/chosen": -41.726070404052734, "logps/rejected": -2.1981966495513916, "loss": 0.4642, "rewards/accuracies": 1.0, "rewards/chosen": 1.142438530921936, "rewards/margins": 0.5264302492141724, "rewards/rejected": 0.6160082817077637, "step": 816 }, { "epoch": 0.44, "learning_rate": 9.832677140686582e-08, "logits/chosen": -2.146639347076416, "logits/rejected": -2.2665369510650635, "logps/chosen": -3.137146234512329, "logps/rejected": -3.0043091773986816, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.520094096660614, "rewards/margins": 0.02644526958465576, "rewards/rejected": 0.49364882707595825, "step": 817 }, { "epoch": 0.44, "learning_rate": 9.832116466567866e-08, "logits/chosen": -2.117733955383301, "logits/rejected": -2.118229866027832, "logps/chosen": -2.8812270164489746, "logps/rejected": -3.711061477661133, "loss": 0.5258, "rewards/accuracies": 1.0, "rewards/chosen": 0.9378768801689148, "rewards/margins": 0.36832481622695923, "rewards/rejected": 0.5695520639419556, "step": 818 }, { "epoch": 0.44, "learning_rate": 9.831554870690944e-08, "logits/chosen": -2.0257694721221924, "logits/rejected": -2.013282537460327, "logps/chosen": -9.921590805053711, "logps/rejected": -0.5405534505844116, "loss": 0.5213, "rewards/accuracies": 1.0, "rewards/chosen": 1.2287031412124634, "rewards/margins": 0.3793755769729614, "rewards/rejected": 0.849327564239502, "step": 819 }, { "epoch": 0.44, "learning_rate": 9.83099235316295e-08, "logits/chosen": -2.0586905479431152, "logits/rejected": -2.06502628326416, "logps/chosen": -5.561713695526123, "logps/rejected": -2.1356489658355713, "loss": 0.5644, "rewards/accuracies": 1.0, "rewards/chosen": 0.9996023178100586, "rewards/margins": 0.2765464782714844, "rewards/rejected": 0.7230558395385742, "step": 820 }, { "epoch": 0.44, "learning_rate": 9.830428914091184e-08, "logits/chosen": -2.1728157997131348, "logits/rejected": -2.2378509044647217, "logps/chosen": -1.0910460948944092, "logps/rejected": -1.0561234951019287, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 1.0106158256530762, "rewards/margins": 0.01298379898071289, "rewards/rejected": 0.9976320266723633, "step": 821 }, { "epoch": 0.44, "learning_rate": 9.829864553583126e-08, "logits/chosen": -1.9922209978103638, "logits/rejected": -2.2036118507385254, "logps/chosen": -0.7843589186668396, "logps/rejected": -0.839891254901886, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.8116282820701599, "rewards/margins": 0.008138656616210938, "rewards/rejected": 0.803489625453949, "step": 822 }, { "epoch": 0.44, "learning_rate": 9.829299271746432e-08, "logits/chosen": -2.1347851753234863, "logits/rejected": -2.1351423263549805, "logps/chosen": -1.686468482017517, "logps/rejected": -2.2360033988952637, "loss": 0.5707, "rewards/accuracies": 1.0, "rewards/chosen": 0.9925393462181091, "rewards/margins": 0.26195693016052246, "rewards/rejected": 0.7305824160575867, "step": 823 }, { "epoch": 0.44, "learning_rate": 9.828733068688934e-08, "logits/chosen": -2.0577166080474854, "logits/rejected": -2.0445563793182373, "logps/chosen": -22.949432373046875, "logps/rejected": -5.369709014892578, "loss": 0.6263, "rewards/accuracies": 1.0, "rewards/chosen": 0.9896621704101562, "rewards/margins": 0.13849306106567383, "rewards/rejected": 0.8511691093444824, "step": 824 }, { "epoch": 0.44, "learning_rate": 9.82816594451864e-08, "logits/chosen": -2.0354955196380615, "logits/rejected": -2.03019380569458, "logps/chosen": -5.284384727478027, "logps/rejected": -3.951791524887085, "loss": 0.3502, "rewards/accuracies": 1.0, "rewards/chosen": 1.3286540508270264, "rewards/margins": 0.8691960573196411, "rewards/rejected": 0.45945796370506287, "step": 825 }, { "epoch": 0.45, "learning_rate": 9.82759789934373e-08, "logits/chosen": -2.031304121017456, "logits/rejected": -2.2424261569976807, "logps/chosen": -10.274499893188477, "logps/rejected": -10.430588722229004, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.6643773913383484, "rewards/margins": 0.005700349807739258, "rewards/rejected": 0.6586770415306091, "step": 826 }, { "epoch": 0.45, "learning_rate": 9.827028933272563e-08, "logits/chosen": -1.9653681516647339, "logits/rejected": -2.243900775909424, "logps/chosen": -2.3346991539001465, "logps/rejected": -2.2363240718841553, "loss": 0.6729, "rewards/accuracies": 1.0, "rewards/chosen": 0.9893817901611328, "rewards/margins": 0.040974318981170654, "rewards/rejected": 0.9484074711799622, "step": 827 }, { "epoch": 0.45, "learning_rate": 9.826459046413673e-08, "logits/chosen": -2.1255502700805664, "logits/rejected": -2.030339479446411, "logps/chosen": -34.965599060058594, "logps/rejected": -2.3605589866638184, "loss": 0.4864, "rewards/accuracies": 1.0, "rewards/chosen": 1.1553386449813843, "rewards/margins": 0.4677377939224243, "rewards/rejected": 0.68760085105896, "step": 828 }, { "epoch": 0.45, "learning_rate": 9.825888238875771e-08, "logits/chosen": -2.1166346073150635, "logits/rejected": -2.1177451610565186, "logps/chosen": -2.8838839530944824, "logps/rejected": -2.9103384017944336, "loss": 0.6197, "rewards/accuracies": 1.0, "rewards/chosen": 0.8151359558105469, "rewards/margins": 0.1526969075202942, "rewards/rejected": 0.6624390482902527, "step": 829 }, { "epoch": 0.45, "learning_rate": 9.825316510767743e-08, "logits/chosen": -2.029340982437134, "logits/rejected": -2.028923988342285, "logps/chosen": -6.402166366577148, "logps/rejected": -3.5896236896514893, "loss": 0.5477, "rewards/accuracies": 1.0, "rewards/chosen": 1.0786913633346558, "rewards/margins": 0.31578004360198975, "rewards/rejected": 0.762911319732666, "step": 830 }, { "epoch": 0.45, "learning_rate": 9.824743862198646e-08, "logits/chosen": -2.1422555446624756, "logits/rejected": -2.1123595237731934, "logps/chosen": -6.585239410400391, "logps/rejected": -4.546246528625488, "loss": 0.4393, "rewards/accuracies": 1.0, "rewards/chosen": 1.1356182098388672, "rewards/margins": 0.5949257016181946, "rewards/rejected": 0.5406925082206726, "step": 831 }, { "epoch": 0.45, "learning_rate": 9.824170293277721e-08, "logits/chosen": -1.943886160850525, "logits/rejected": -1.936734676361084, "logps/chosen": -6.168758392333984, "logps/rejected": -3.6127750873565674, "loss": 0.4147, "rewards/accuracies": 1.0, "rewards/chosen": 1.2802051305770874, "rewards/margins": 0.6656050086021423, "rewards/rejected": 0.6146001219749451, "step": 832 }, { "epoch": 0.45, "learning_rate": 9.823595804114378e-08, "logits/chosen": -2.1188220977783203, "logits/rejected": -2.1112914085388184, "logps/chosen": -16.807228088378906, "logps/rejected": -11.079776763916016, "loss": 0.5524, "rewards/accuracies": 1.0, "rewards/chosen": 1.0722589492797852, "rewards/margins": 0.30471307039260864, "rewards/rejected": 0.7675458788871765, "step": 833 }, { "epoch": 0.45, "learning_rate": 9.823020394818202e-08, "logits/chosen": -2.1756677627563477, "logits/rejected": -2.31882643699646, "logps/chosen": -4.562434196472168, "logps/rejected": -1.7786617279052734, "loss": 0.7962, "rewards/accuracies": 0.0, "rewards/chosen": 0.6388399004936218, "rewards/margins": -0.1965419054031372, "rewards/rejected": 0.835381805896759, "step": 834 }, { "epoch": 0.45, "learning_rate": 9.822444065498963e-08, "logits/chosen": -1.9865596294403076, "logits/rejected": -1.97584068775177, "logps/chosen": -8.499244689941406, "logps/rejected": -4.381789207458496, "loss": 0.4537, "rewards/accuracies": 1.0, "rewards/chosen": 1.2850698232650757, "rewards/margins": 0.5549767017364502, "rewards/rejected": 0.7300931215286255, "step": 835 }, { "epoch": 0.45, "learning_rate": 9.821866816266593e-08, "logits/chosen": -2.1077675819396973, "logits/rejected": -2.106473684310913, "logps/chosen": -1.6167417764663696, "logps/rejected": -1.3115389347076416, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.8269662857055664, "rewards/margins": 0.018754661083221436, "rewards/rejected": 0.808211624622345, "step": 836 }, { "epoch": 0.45, "learning_rate": 9.821288647231207e-08, "logits/chosen": -2.0675017833709717, "logits/rejected": -2.057302474975586, "logps/chosen": -4.5135579109191895, "logps/rejected": -7.5239973068237305, "loss": 0.5227, "rewards/accuracies": 1.0, "rewards/chosen": 0.8109939694404602, "rewards/margins": 0.37599918246269226, "rewards/rejected": 0.43499478697776794, "step": 837 }, { "epoch": 0.45, "learning_rate": 9.8207095585031e-08, "logits/chosen": -2.130689859390259, "logits/rejected": -2.079197406768799, "logps/chosen": -32.908451080322266, "logps/rejected": -2.9197964668273926, "loss": 0.552, "rewards/accuracies": 1.0, "rewards/chosen": 1.110744833946228, "rewards/margins": 0.305556058883667, "rewards/rejected": 0.805188775062561, "step": 838 }, { "epoch": 0.45, "learning_rate": 9.820129550192731e-08, "logits/chosen": -2.003917932510376, "logits/rejected": -1.9935240745544434, "logps/chosen": -13.158180236816406, "logps/rejected": -3.0205764770507812, "loss": 0.6609, "rewards/accuracies": 1.0, "rewards/chosen": 0.877571702003479, "rewards/margins": 0.06548517942428589, "rewards/rejected": 0.8120865225791931, "step": 839 }, { "epoch": 0.45, "learning_rate": 9.819548622410742e-08, "logits/chosen": -2.1584179401397705, "logits/rejected": -2.306725025177002, "logps/chosen": -1.5542523860931396, "logps/rejected": -1.5531975030899048, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0261257886886597, "rewards/margins": 0.011133313179016113, "rewards/rejected": 1.0149924755096436, "step": 840 }, { "epoch": 0.45, "learning_rate": 9.818966775267949e-08, "logits/chosen": -2.033841609954834, "logits/rejected": -2.0293362140655518, "logps/chosen": -6.814944744110107, "logps/rejected": -2.3454887866973877, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 1.2894508838653564, "rewards/margins": 0.6748523116111755, "rewards/rejected": 0.6145985722541809, "step": 841 }, { "epoch": 0.45, "learning_rate": 9.818384008875345e-08, "logits/chosen": -2.097414493560791, "logits/rejected": -2.1024932861328125, "logps/chosen": -2.470613956451416, "logps/rejected": -2.1383326053619385, "loss": 0.5085, "rewards/accuracies": 1.0, "rewards/chosen": 1.0307234525680542, "rewards/margins": 0.4114040732383728, "rewards/rejected": 0.6193193793296814, "step": 842 }, { "epoch": 0.45, "learning_rate": 9.817800323344095e-08, "logits/chosen": -2.151911735534668, "logits/rejected": -2.1587750911712646, "logps/chosen": -2.931948661804199, "logps/rejected": -3.1749837398529053, "loss": 0.515, "rewards/accuracies": 1.0, "rewards/chosen": 1.0255892276763916, "rewards/margins": 0.3949926495552063, "rewards/rejected": 0.6305965781211853, "step": 843 }, { "epoch": 0.46, "learning_rate": 9.81721571878554e-08, "logits/chosen": -2.0888195037841797, "logits/rejected": -2.096224308013916, "logps/chosen": -3.191157341003418, "logps/rejected": -2.6837382316589355, "loss": 0.5422, "rewards/accuracies": 1.0, "rewards/chosen": 1.1915433406829834, "rewards/margins": 0.3287961483001709, "rewards/rejected": 0.8627471923828125, "step": 844 }, { "epoch": 0.46, "learning_rate": 9.8166301953112e-08, "logits/chosen": -1.9662187099456787, "logits/rejected": -1.9670089483261108, "logps/chosen": -4.67142915725708, "logps/rejected": -1.4181246757507324, "loss": 0.5536, "rewards/accuracies": 1.0, "rewards/chosen": 1.1562508344650269, "rewards/margins": 0.3017740845680237, "rewards/rejected": 0.8544767498970032, "step": 845 }, { "epoch": 0.46, "learning_rate": 9.816043753032766e-08, "logits/chosen": -2.0019540786743164, "logits/rejected": -2.3435819149017334, "logps/chosen": -1.5458091497421265, "logps/rejected": -1.3508472442626953, "loss": 0.6741, "rewards/accuracies": 1.0, "rewards/chosen": 0.8171802759170532, "rewards/margins": 0.038433969020843506, "rewards/rejected": 0.7787463068962097, "step": 846 }, { "epoch": 0.46, "learning_rate": 9.815456392062102e-08, "logits/chosen": -2.0524795055389404, "logits/rejected": -2.249283790588379, "logps/chosen": -3.2909960746765137, "logps/rejected": -8.008631706237793, "loss": 0.767, "rewards/accuracies": 0.0, "rewards/chosen": 0.7856904864311218, "rewards/margins": -0.1426127552986145, "rewards/rejected": 0.9283032417297363, "step": 847 }, { "epoch": 0.46, "learning_rate": 9.814868112511257e-08, "logits/chosen": -2.069335699081421, "logits/rejected": -2.0537705421447754, "logps/chosen": -11.993854522705078, "logps/rejected": -3.19677472114563, "loss": 0.4687, "rewards/accuracies": 1.0, "rewards/chosen": 1.1941263675689697, "rewards/margins": 0.51418536901474, "rewards/rejected": 0.6799409985542297, "step": 848 }, { "epoch": 0.46, "learning_rate": 9.814278914492446e-08, "logits/chosen": -2.0650196075439453, "logits/rejected": -2.2054290771484375, "logps/chosen": -3.7938742637634277, "logps/rejected": -3.6221747398376465, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.5732724666595459, "rewards/margins": 0.01623058319091797, "rewards/rejected": 0.5570418834686279, "step": 849 }, { "epoch": 0.46, "learning_rate": 9.813688798118065e-08, "logits/chosen": -2.111302137374878, "logits/rejected": -2.106518268585205, "logps/chosen": -7.674297332763672, "logps/rejected": -3.4210915565490723, "loss": 0.5084, "rewards/accuracies": 1.0, "rewards/chosen": 1.0199469327926636, "rewards/margins": 0.41153591871261597, "rewards/rejected": 0.6084110140800476, "step": 850 }, { "epoch": 0.46, "learning_rate": 9.813097763500677e-08, "logits/chosen": -1.9812508821487427, "logits/rejected": -2.353654384613037, "logps/chosen": -9.102294921875, "logps/rejected": -21.970205307006836, "loss": 0.5831, "rewards/accuracies": 1.0, "rewards/chosen": 0.7378048300743103, "rewards/margins": 0.23380661010742188, "rewards/rejected": 0.5039982199668884, "step": 851 }, { "epoch": 0.46, "learning_rate": 9.812505810753033e-08, "logits/chosen": -2.0523898601531982, "logits/rejected": -2.0576863288879395, "logps/chosen": -2.6386220455169678, "logps/rejected": -3.4336211681365967, "loss": 0.5417, "rewards/accuracies": 1.0, "rewards/chosen": 0.9517048001289368, "rewards/margins": 0.3300851583480835, "rewards/rejected": 0.6216196417808533, "step": 852 }, { "epoch": 0.46, "learning_rate": 9.811912939988047e-08, "logits/chosen": -2.024656295776367, "logits/rejected": -2.2341270446777344, "logps/chosen": -0.9969562292098999, "logps/rejected": -1.0155261754989624, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.9627217650413513, "rewards/margins": 0.0008778572082519531, "rewards/rejected": 0.9618439078330994, "step": 853 }, { "epoch": 0.46, "learning_rate": 9.811319151318814e-08, "logits/chosen": -2.020484447479248, "logits/rejected": -2.0189368724823, "logps/chosen": -1.1709983348846436, "logps/rejected": -1.2693839073181152, "loss": 0.636, "rewards/accuracies": 1.0, "rewards/chosen": 0.9561524391174316, "rewards/margins": 0.11786508560180664, "rewards/rejected": 0.838287353515625, "step": 854 }, { "epoch": 0.46, "learning_rate": 9.810724444858602e-08, "logits/chosen": -2.032787799835205, "logits/rejected": -2.0240774154663086, "logps/chosen": -4.367117404937744, "logps/rejected": -4.02483606338501, "loss": 0.436, "rewards/accuracies": 1.0, "rewards/chosen": 1.1863771677017212, "rewards/margins": 0.6043407917022705, "rewards/rejected": 0.5820363759994507, "step": 855 }, { "epoch": 0.46, "learning_rate": 9.810128820720858e-08, "logits/chosen": -2.1055119037628174, "logits/rejected": -2.0922303199768066, "logps/chosen": -18.48347282409668, "logps/rejected": -4.784005641937256, "loss": 0.4448, "rewards/accuracies": 1.0, "rewards/chosen": 1.0734052658081055, "rewards/margins": 0.5794975757598877, "rewards/rejected": 0.4939076900482178, "step": 856 }, { "epoch": 0.46, "learning_rate": 9.809532279019202e-08, "logits/chosen": -1.9815260171890259, "logits/rejected": -2.244414806365967, "logps/chosen": -3.793588638305664, "logps/rejected": -6.395710468292236, "loss": 0.6453, "rewards/accuracies": 1.0, "rewards/chosen": 0.8936561942100525, "rewards/margins": 0.0981256365776062, "rewards/rejected": 0.7955305576324463, "step": 857 }, { "epoch": 0.46, "learning_rate": 9.808934819867423e-08, "logits/chosen": -2.102083444595337, "logits/rejected": -2.2669081687927246, "logps/chosen": -1.2302199602127075, "logps/rejected": -1.2176827192306519, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.8173791766166687, "rewards/margins": -0.006992161273956299, "rewards/rejected": 0.824371337890625, "step": 858 }, { "epoch": 0.46, "learning_rate": 9.808336443379494e-08, "logits/chosen": -2.030914306640625, "logits/rejected": -2.2914631366729736, "logps/chosen": -0.9441138505935669, "logps/rejected": -0.875032365322113, "loss": 0.7005, "rewards/accuracies": 0.0, "rewards/chosen": 0.8003302812576294, "rewards/margins": -0.014706313610076904, "rewards/rejected": 0.8150365948677063, "step": 859 }, { "epoch": 0.46, "learning_rate": 9.80773714966956e-08, "logits/chosen": -2.0653326511383057, "logits/rejected": -2.2319962978363037, "logps/chosen": -0.6167787909507751, "logps/rejected": -0.66422039270401, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.8268195986747742, "rewards/margins": 0.035969555377960205, "rewards/rejected": 0.790850043296814, "step": 860 }, { "epoch": 0.46, "learning_rate": 9.807136938851938e-08, "logits/chosen": -2.1371355056762695, "logits/rejected": -2.1424708366394043, "logps/chosen": -3.7331767082214355, "logps/rejected": -2.975773334503174, "loss": 0.5873, "rewards/accuracies": 1.0, "rewards/chosen": 0.9475303888320923, "rewards/margins": 0.22432255744934082, "rewards/rejected": 0.7232078313827515, "step": 861 }, { "epoch": 0.46, "learning_rate": 9.806535811041125e-08, "logits/chosen": -2.001009225845337, "logits/rejected": -2.260934352874756, "logps/chosen": -11.844461441040039, "logps/rejected": -13.55362606048584, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9438081979751587, "rewards/margins": 0.0010585188865661621, "rewards/rejected": 0.9427496790885925, "step": 862 }, { "epoch": 0.47, "learning_rate": 9.805933766351786e-08, "logits/chosen": -2.1415345668792725, "logits/rejected": -2.158162832260132, "logps/chosen": -15.427772521972656, "logps/rejected": -9.275177001953125, "loss": 0.4586, "rewards/accuracies": 1.0, "rewards/chosen": 1.431592583656311, "rewards/margins": 0.5415071845054626, "rewards/rejected": 0.8900853991508484, "step": 863 }, { "epoch": 0.47, "learning_rate": 9.805330804898769e-08, "logits/chosen": -2.1119163036346436, "logits/rejected": -2.275313138961792, "logps/chosen": -1.6595866680145264, "logps/rejected": -1.8020106554031372, "loss": 0.6448, "rewards/accuracies": 1.0, "rewards/chosen": 0.9041858911514282, "rewards/margins": 0.09919989109039307, "rewards/rejected": 0.8049860000610352, "step": 864 }, { "epoch": 0.47, "learning_rate": 9.804726926797092e-08, "logits/chosen": -2.122232675552368, "logits/rejected": -2.1170246601104736, "logps/chosen": -4.823853492736816, "logps/rejected": -3.0270915031433105, "loss": 0.4766, "rewards/accuracies": 1.0, "rewards/chosen": 1.005860447883606, "rewards/margins": 0.49340587854385376, "rewards/rejected": 0.5124545693397522, "step": 865 }, { "epoch": 0.47, "learning_rate": 9.804122132161945e-08, "logits/chosen": -2.0651838779449463, "logits/rejected": -2.2462804317474365, "logps/chosen": -7.407602787017822, "logps/rejected": -10.347253799438477, "loss": 0.6639, "rewards/accuracies": 1.0, "rewards/chosen": 0.7357476949691772, "rewards/margins": 0.059415578842163086, "rewards/rejected": 0.6763321161270142, "step": 866 }, { "epoch": 0.47, "learning_rate": 9.803516421108702e-08, "logits/chosen": -2.0591304302215576, "logits/rejected": -2.060110330581665, "logps/chosen": -1.1200779676437378, "logps/rejected": -3.020578622817993, "loss": 0.5406, "rewards/accuracies": 1.0, "rewards/chosen": 0.9162298440933228, "rewards/margins": 0.33263683319091797, "rewards/rejected": 0.5835930109024048, "step": 867 }, { "epoch": 0.47, "learning_rate": 9.802909793752903e-08, "logits/chosen": -1.9559133052825928, "logits/rejected": -1.9515597820281982, "logps/chosen": -8.638445854187012, "logps/rejected": -2.412555456161499, "loss": 0.3975, "rewards/accuracies": 1.0, "rewards/chosen": 1.34744393825531, "rewards/margins": 0.7173811793327332, "rewards/rejected": 0.6300627589225769, "step": 868 }, { "epoch": 0.47, "learning_rate": 9.802302250210266e-08, "logits/chosen": -2.0159237384796143, "logits/rejected": -2.2513864040374756, "logps/chosen": -1.1697797775268555, "logps/rejected": -1.168813943862915, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.8706780672073364, "rewards/margins": 0.0063468217849731445, "rewards/rejected": 0.8643312454223633, "step": 869 }, { "epoch": 0.47, "learning_rate": 9.801693790596686e-08, "logits/chosen": -2.157787799835205, "logits/rejected": -2.3481876850128174, "logps/chosen": -1.1059142351150513, "logps/rejected": -1.181742548942566, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 1.0211466550827026, "rewards/margins": 0.020325779914855957, "rewards/rejected": 1.0008208751678467, "step": 870 }, { "epoch": 0.47, "learning_rate": 9.80108441502823e-08, "logits/chosen": -2.1193816661834717, "logits/rejected": -2.3648643493652344, "logps/chosen": -14.748191833496094, "logps/rejected": -20.23917007446289, "loss": 0.6236, "rewards/accuracies": 1.0, "rewards/chosen": 0.48252448439598083, "rewards/margins": 0.14435195922851562, "rewards/rejected": 0.3381725251674652, "step": 871 }, { "epoch": 0.47, "learning_rate": 9.80047412362114e-08, "logits/chosen": -2.0207958221435547, "logits/rejected": -2.024857759475708, "logps/chosen": -1.0170717239379883, "logps/rejected": -2.873303174972534, "loss": 0.5721, "rewards/accuracies": 1.0, "rewards/chosen": 0.7817218899726868, "rewards/margins": 0.2588456869125366, "rewards/rejected": 0.5228762030601501, "step": 872 }, { "epoch": 0.47, "learning_rate": 9.799862916491833e-08, "logits/chosen": -2.1663620471954346, "logits/rejected": -2.2963991165161133, "logps/chosen": -5.2252678871154785, "logps/rejected": -7.896876335144043, "loss": 0.6262, "rewards/accuracies": 1.0, "rewards/chosen": 0.8721744418144226, "rewards/margins": 0.13866376876831055, "rewards/rejected": 0.7335106730461121, "step": 873 }, { "epoch": 0.47, "learning_rate": 9.799250793756901e-08, "logits/chosen": -2.063148021697998, "logits/rejected": -2.210998058319092, "logps/chosen": -2.2102174758911133, "logps/rejected": -2.814488172531128, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 0.8376584053039551, "rewards/margins": 0.03097158670425415, "rewards/rejected": 0.8066868185997009, "step": 874 }, { "epoch": 0.47, "learning_rate": 9.798637755533111e-08, "logits/chosen": -2.1890573501586914, "logits/rejected": -2.2181782722473145, "logps/chosen": -6.907183647155762, "logps/rejected": -27.887500762939453, "loss": 0.623, "rewards/accuracies": 1.0, "rewards/chosen": 0.9488034248352051, "rewards/margins": 0.14568454027175903, "rewards/rejected": 0.803118884563446, "step": 875 }, { "epoch": 0.47, "learning_rate": 9.798023801937404e-08, "logits/chosen": -2.113515853881836, "logits/rejected": -2.1084375381469727, "logps/chosen": -9.327875137329102, "logps/rejected": -3.6746177673339844, "loss": 0.4459, "rewards/accuracies": 1.0, "rewards/chosen": 1.1584008932113647, "rewards/margins": 0.5763630270957947, "rewards/rejected": 0.5820378661155701, "step": 876 }, { "epoch": 0.47, "learning_rate": 9.797408933086894e-08, "logits/chosen": -2.155538558959961, "logits/rejected": -2.3037569522857666, "logps/chosen": -4.751223564147949, "logps/rejected": -4.746417999267578, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.7272880673408508, "rewards/margins": 0.0006226301193237305, "rewards/rejected": 0.7266654372215271, "step": 877 }, { "epoch": 0.47, "learning_rate": 9.796793149098874e-08, "logits/chosen": -2.1476612091064453, "logits/rejected": -2.148716688156128, "logps/chosen": -3.3679003715515137, "logps/rejected": -12.019411087036133, "loss": 0.6212, "rewards/accuracies": 1.0, "rewards/chosen": 0.8929864764213562, "rewards/margins": 0.14940202236175537, "rewards/rejected": 0.7435844540596008, "step": 878 }, { "epoch": 0.47, "learning_rate": 9.796176450090808e-08, "logits/chosen": -2.1196019649505615, "logits/rejected": -2.2927892208099365, "logps/chosen": -0.9298751354217529, "logps/rejected": -0.9614865779876709, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.8213347792625427, "rewards/margins": 0.028040647506713867, "rewards/rejected": 0.7932941317558289, "step": 879 }, { "epoch": 0.47, "learning_rate": 9.795558836180334e-08, "logits/chosen": -2.0042426586151123, "logits/rejected": -1.9993125200271606, "logps/chosen": -8.221110343933105, "logps/rejected": -13.549951553344727, "loss": 0.5811, "rewards/accuracies": 1.0, "rewards/chosen": 1.080410361289978, "rewards/margins": 0.23829644918441772, "rewards/rejected": 0.8421139121055603, "step": 880 }, { "epoch": 0.48, "learning_rate": 9.794940307485268e-08, "logits/chosen": -1.9800374507904053, "logits/rejected": -1.9856865406036377, "logps/chosen": -2.583575963973999, "logps/rejected": -2.907933235168457, "loss": 0.5247, "rewards/accuracies": 1.0, "rewards/chosen": 0.9110011458396912, "rewards/margins": 0.37123262882232666, "rewards/rejected": 0.5397685170173645, "step": 881 }, { "epoch": 0.48, "learning_rate": 9.794320864123595e-08, "logits/chosen": -2.132052421569824, "logits/rejected": -2.0045535564422607, "logps/chosen": -38.48339080810547, "logps/rejected": -4.124563217163086, "loss": 0.4748, "rewards/accuracies": 1.0, "rewards/chosen": 1.010239839553833, "rewards/margins": 0.4982016086578369, "rewards/rejected": 0.5120382308959961, "step": 882 }, { "epoch": 0.48, "learning_rate": 9.793700506213482e-08, "logits/chosen": -2.0746841430664062, "logits/rejected": -2.0154857635498047, "logps/chosen": -26.832988739013672, "logps/rejected": -2.744938850402832, "loss": 0.5629, "rewards/accuracies": 1.0, "rewards/chosen": 1.0283504724502563, "rewards/margins": 0.2800547480583191, "rewards/rejected": 0.7482957243919373, "step": 883 }, { "epoch": 0.48, "learning_rate": 9.793079233873265e-08, "logits/chosen": -2.0759172439575195, "logits/rejected": -2.2738475799560547, "logps/chosen": -0.9337854385375977, "logps/rejected": -1.024334192276001, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.675306499004364, "rewards/margins": 0.016648173332214355, "rewards/rejected": 0.6586583256721497, "step": 884 }, { "epoch": 0.48, "learning_rate": 9.792457047221456e-08, "logits/chosen": -1.9826980829238892, "logits/rejected": -2.248530626296997, "logps/chosen": -3.7348928451538086, "logps/rejected": -3.8637969493865967, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 1.0983067750930786, "rewards/margins": 0.03152871131896973, "rewards/rejected": 1.0667780637741089, "step": 885 }, { "epoch": 0.48, "learning_rate": 9.791833946376739e-08, "logits/chosen": -2.1146445274353027, "logits/rejected": -2.1035079956054688, "logps/chosen": -5.3232221603393555, "logps/rejected": -7.269735336303711, "loss": 0.5578, "rewards/accuracies": 1.0, "rewards/chosen": 1.007260799407959, "rewards/margins": 0.29201823472976685, "rewards/rejected": 0.7152425646781921, "step": 886 }, { "epoch": 0.48, "learning_rate": 9.791209931457976e-08, "logits/chosen": -2.1213624477386475, "logits/rejected": -2.121809244155884, "logps/chosen": -2.7195346355438232, "logps/rejected": -2.7468209266662598, "loss": 0.6135, "rewards/accuracies": 1.0, "rewards/chosen": 0.9378744959831238, "rewards/margins": 0.16616088151931763, "rewards/rejected": 0.7717136144638062, "step": 887 }, { "epoch": 0.48, "learning_rate": 9.790585002584204e-08, "logits/chosen": -1.997352123260498, "logits/rejected": -2.010124921798706, "logps/chosen": -3.9874684810638428, "logps/rejected": -7.755928039550781, "loss": 0.4836, "rewards/accuracies": 1.0, "rewards/chosen": 1.0538276433944702, "rewards/margins": 0.47488933801651, "rewards/rejected": 0.5789383053779602, "step": 888 }, { "epoch": 0.48, "learning_rate": 9.78995915987463e-08, "logits/chosen": -2.1695797443389893, "logits/rejected": -2.1703240871429443, "logps/chosen": -2.0873141288757324, "logps/rejected": -1.3490270376205444, "loss": 0.6235, "rewards/accuracies": 1.0, "rewards/chosen": 1.1630932092666626, "rewards/margins": 0.14446747303009033, "rewards/rejected": 1.0186257362365723, "step": 889 }, { "epoch": 0.48, "learning_rate": 9.789332403448638e-08, "logits/chosen": -2.078972339630127, "logits/rejected": -2.0662198066711426, "logps/chosen": -8.274059295654297, "logps/rejected": -2.4796814918518066, "loss": 0.5286, "rewards/accuracies": 1.0, "rewards/chosen": 0.9938560724258423, "rewards/margins": 0.3615376353263855, "rewards/rejected": 0.6323184370994568, "step": 890 }, { "epoch": 0.48, "learning_rate": 9.788704733425786e-08, "logits/chosen": -2.177171230316162, "logits/rejected": -2.1308538913726807, "logps/chosen": -20.777313232421875, "logps/rejected": -3.4248011112213135, "loss": 0.4225, "rewards/accuracies": 1.0, "rewards/chosen": 1.2500953674316406, "rewards/margins": 0.6427696943283081, "rewards/rejected": 0.6073256731033325, "step": 891 }, { "epoch": 0.48, "learning_rate": 9.788076149925806e-08, "logits/chosen": -2.0746536254882812, "logits/rejected": -2.2694597244262695, "logps/chosen": -1.6385611295700073, "logps/rejected": -5.63430118560791, "loss": 0.6013, "rewards/accuracies": 1.0, "rewards/chosen": 0.782646119594574, "rewards/margins": 0.1928882598876953, "rewards/rejected": 0.5897578597068787, "step": 892 }, { "epoch": 0.48, "learning_rate": 9.787446653068606e-08, "logits/chosen": -2.1098670959472656, "logits/rejected": -2.1175537109375, "logps/chosen": -2.1779580116271973, "logps/rejected": -3.384554862976074, "loss": 0.5517, "rewards/accuracies": 1.0, "rewards/chosen": 0.987658679485321, "rewards/margins": 0.3062868118286133, "rewards/rejected": 0.6813718676567078, "step": 893 }, { "epoch": 0.48, "learning_rate": 9.786816242974265e-08, "logits/chosen": -1.9329742193222046, "logits/rejected": -1.9053688049316406, "logps/chosen": -12.178325653076172, "logps/rejected": -3.0571553707122803, "loss": 0.6105, "rewards/accuracies": 1.0, "rewards/chosen": 0.7856577038764954, "rewards/margins": 0.1726595163345337, "rewards/rejected": 0.6129981875419617, "step": 894 }, { "epoch": 0.48, "learning_rate": 9.786184919763039e-08, "logits/chosen": -2.13904070854187, "logits/rejected": -2.209772825241089, "logps/chosen": -0.9619069695472717, "logps/rejected": -2.493633508682251, "loss": 0.6431, "rewards/accuracies": 1.0, "rewards/chosen": 0.8838689923286438, "rewards/margins": 0.10282230377197266, "rewards/rejected": 0.7810466885566711, "step": 895 }, { "epoch": 0.48, "learning_rate": 9.785552683555356e-08, "logits/chosen": -2.029128074645996, "logits/rejected": -2.0282766819000244, "logps/chosen": -8.0355224609375, "logps/rejected": -3.420048952102661, "loss": 0.3492, "rewards/accuracies": 1.0, "rewards/chosen": 1.3879978656768799, "rewards/margins": 0.8722804188728333, "rewards/rejected": 0.5157174468040466, "step": 896 }, { "epoch": 0.48, "learning_rate": 9.784919534471819e-08, "logits/chosen": -1.9592148065567017, "logits/rejected": -2.207362413406372, "logps/chosen": -1.2042322158813477, "logps/rejected": -3.487091541290283, "loss": 0.5644, "rewards/accuracies": 1.0, "rewards/chosen": 0.9527460336685181, "rewards/margins": 0.2765629291534424, "rewards/rejected": 0.6761831045150757, "step": 897 }, { "epoch": 0.48, "learning_rate": 9.784285472633207e-08, "logits/chosen": -2.172316312789917, "logits/rejected": -2.1781952381134033, "logps/chosen": -2.3622043132781982, "logps/rejected": -1.3496500253677368, "loss": 0.5351, "rewards/accuracies": 1.0, "rewards/chosen": 0.9704346656799316, "rewards/margins": 0.34594184160232544, "rewards/rejected": 0.6244928240776062, "step": 898 }, { "epoch": 0.48, "learning_rate": 9.78365049816047e-08, "logits/chosen": -2.1060917377471924, "logits/rejected": -2.1049649715423584, "logps/chosen": -4.419621467590332, "logps/rejected": -1.612679362297058, "loss": 0.6072, "rewards/accuracies": 1.0, "rewards/chosen": 1.1609858274459839, "rewards/margins": 0.18002372980117798, "rewards/rejected": 0.9809620976448059, "step": 899 }, { "epoch": 0.49, "learning_rate": 9.783014611174736e-08, "logits/chosen": -1.9943037033081055, "logits/rejected": -2.2870914936065674, "logps/chosen": -0.8848901987075806, "logps/rejected": -1.044468641281128, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.830978512763977, "rewards/margins": 0.03207141160964966, "rewards/rejected": 0.7989071011543274, "step": 900 }, { "epoch": 0.49, "learning_rate": 9.782377811797301e-08, "logits/chosen": -2.130711793899536, "logits/rejected": -2.130124092102051, "logps/chosen": -1.2711586952209473, "logps/rejected": -1.8897016048431396, "loss": 0.7122, "rewards/accuracies": 0.0, "rewards/chosen": 0.8753301501274109, "rewards/margins": -0.037673234939575195, "rewards/rejected": 0.9130033850669861, "step": 901 }, { "epoch": 0.49, "learning_rate": 9.781740100149641e-08, "logits/chosen": -2.0410897731781006, "logits/rejected": -2.2421376705169678, "logps/chosen": -1.5849909782409668, "logps/rejected": -1.4787129163742065, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.736752450466156, "rewards/margins": 0.002056419849395752, "rewards/rejected": 0.7346960306167603, "step": 902 }, { "epoch": 0.49, "learning_rate": 9.781101476353404e-08, "logits/chosen": -2.0629005432128906, "logits/rejected": -2.0583653450012207, "logps/chosen": -0.48215240240097046, "logps/rejected": -10.632331848144531, "loss": 0.5625, "rewards/accuracies": 1.0, "rewards/chosen": 0.843092143535614, "rewards/margins": 0.2809464931488037, "rewards/rejected": 0.5621456503868103, "step": 903 }, { "epoch": 0.49, "learning_rate": 9.78046194053041e-08, "logits/chosen": -1.9964927434921265, "logits/rejected": -1.979577660560608, "logps/chosen": -37.50559997558594, "logps/rejected": -7.642465114593506, "loss": 0.479, "rewards/accuracies": 1.0, "rewards/chosen": 0.9396259188652039, "rewards/margins": 0.4869633913040161, "rewards/rejected": 0.45266252756118774, "step": 904 }, { "epoch": 0.49, "learning_rate": 9.779821492802655e-08, "logits/chosen": -2.119168996810913, "logits/rejected": -2.122803211212158, "logps/chosen": -3.8094069957733154, "logps/rejected": -12.409721374511719, "loss": 0.3409, "rewards/accuracies": 1.0, "rewards/chosen": 1.4026858806610107, "rewards/margins": 0.9009997844696045, "rewards/rejected": 0.5016860961914062, "step": 905 }, { "epoch": 0.49, "learning_rate": 9.779180133292312e-08, "logits/chosen": -2.038355588912964, "logits/rejected": -2.235339641571045, "logps/chosen": -0.8172494769096375, "logps/rejected": -0.8633102178573608, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.7411038279533386, "rewards/margins": 0.009634912014007568, "rewards/rejected": 0.731468915939331, "step": 906 }, { "epoch": 0.49, "learning_rate": 9.778537862121721e-08, "logits/chosen": -2.064199209213257, "logits/rejected": -2.0660033226013184, "logps/chosen": -0.940683126449585, "logps/rejected": -1.522592544555664, "loss": 0.5917, "rewards/accuracies": 1.0, "rewards/chosen": 1.025255560874939, "rewards/margins": 0.2143750786781311, "rewards/rejected": 0.8108804821968079, "step": 907 }, { "epoch": 0.49, "learning_rate": 9.777894679413399e-08, "logits/chosen": -1.9603264331817627, "logits/rejected": -1.9603660106658936, "logps/chosen": -2.0886495113372803, "logps/rejected": -1.113049030303955, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.8274378776550293, "rewards/margins": -0.01051551103591919, "rewards/rejected": 0.8379533886909485, "step": 908 }, { "epoch": 0.49, "learning_rate": 9.77725058529004e-08, "logits/chosen": -2.058526039123535, "logits/rejected": -2.273348569869995, "logps/chosen": -13.615252494812012, "logps/rejected": -15.337685585021973, "loss": 0.5956, "rewards/accuracies": 1.0, "rewards/chosen": 0.9384558796882629, "rewards/margins": 0.20554065704345703, "rewards/rejected": 0.7329152226448059, "step": 909 }, { "epoch": 0.49, "learning_rate": 9.776605579874507e-08, "logits/chosen": -2.0569543838500977, "logits/rejected": -2.2906219959259033, "logps/chosen": -5.281612396240234, "logps/rejected": -1.4879382848739624, "loss": 0.9014, "rewards/accuracies": 0.0, "rewards/chosen": 0.6604714393615723, "rewards/margins": -0.3805474042892456, "rewards/rejected": 1.0410188436508179, "step": 910 }, { "epoch": 0.49, "learning_rate": 9.775959663289841e-08, "logits/chosen": -2.0233333110809326, "logits/rejected": -2.0177557468414307, "logps/chosen": -3.0892038345336914, "logps/rejected": -8.474337577819824, "loss": 0.4729, "rewards/accuracies": 1.0, "rewards/chosen": 0.9987576603889465, "rewards/margins": 0.5031974911689758, "rewards/rejected": 0.4955601692199707, "step": 911 }, { "epoch": 0.49, "learning_rate": 9.775312835659252e-08, "logits/chosen": -2.029083728790283, "logits/rejected": -2.2460594177246094, "logps/chosen": -0.8671678900718689, "logps/rejected": -0.8471834063529968, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8252033591270447, "rewards/margins": 0.012067079544067383, "rewards/rejected": 0.8131362795829773, "step": 912 }, { "epoch": 0.49, "learning_rate": 9.774665097106132e-08, "logits/chosen": -2.0908095836639404, "logits/rejected": -2.0995733737945557, "logps/chosen": -2.148453712463379, "logps/rejected": -1.8941535949707031, "loss": 0.5376, "rewards/accuracies": 1.0, "rewards/chosen": 1.1276901960372925, "rewards/margins": 0.33984285593032837, "rewards/rejected": 0.7878473401069641, "step": 913 }, { "epoch": 0.49, "learning_rate": 9.774016447754035e-08, "logits/chosen": -2.2024576663970947, "logits/rejected": -2.0182859897613525, "logps/chosen": -51.075016021728516, "logps/rejected": -3.107330322265625, "loss": 0.3719, "rewards/accuracies": 1.0, "rewards/chosen": 1.4111241102218628, "rewards/margins": 0.7974808216094971, "rewards/rejected": 0.6136432886123657, "step": 914 }, { "epoch": 0.49, "learning_rate": 9.7733668877267e-08, "logits/chosen": -2.068962812423706, "logits/rejected": -2.061072587966919, "logps/chosen": -15.440961837768555, "logps/rejected": -6.255032539367676, "loss": 0.5891, "rewards/accuracies": 1.0, "rewards/chosen": 0.8691393136978149, "rewards/margins": 0.22015255689620972, "rewards/rejected": 0.6489867568016052, "step": 915 }, { "epoch": 0.49, "learning_rate": 9.772716417148031e-08, "logits/chosen": -2.036987066268921, "logits/rejected": -2.285893678665161, "logps/chosen": -2.269042730331421, "logps/rejected": -2.1458842754364014, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": 0.7584563493728638, "rewards/margins": -0.008204638957977295, "rewards/rejected": 0.7666609883308411, "step": 916 }, { "epoch": 0.49, "learning_rate": 9.772065036142113e-08, "logits/chosen": -2.046532154083252, "logits/rejected": -2.3601202964782715, "logps/chosen": -7.319670677185059, "logps/rejected": -22.929567337036133, "loss": 0.5659, "rewards/accuracies": 1.0, "rewards/chosen": 0.6279910206794739, "rewards/margins": 0.2730521261692047, "rewards/rejected": 0.35493889451026917, "step": 917 }, { "epoch": 0.5, "learning_rate": 9.771412744833199e-08, "logits/chosen": -2.050524950027466, "logits/rejected": -2.050558567047119, "logps/chosen": -2.489574432373047, "logps/rejected": -2.605492353439331, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 0.8816982507705688, "rewards/margins": 0.023641228675842285, "rewards/rejected": 0.8580570220947266, "step": 918 }, { "epoch": 0.5, "learning_rate": 9.770759543345719e-08, "logits/chosen": -2.0881717205047607, "logits/rejected": -2.085463285446167, "logps/chosen": -4.908847808837891, "logps/rejected": -4.450653553009033, "loss": 0.3576, "rewards/accuracies": 1.0, "rewards/chosen": 1.401240587234497, "rewards/margins": 0.8441790342330933, "rewards/rejected": 0.5570615530014038, "step": 919 }, { "epoch": 0.5, "learning_rate": 9.770105431804276e-08, "logits/chosen": -2.0006887912750244, "logits/rejected": -2.0014560222625732, "logps/chosen": -1.8585700988769531, "logps/rejected": -2.8468642234802246, "loss": 0.5495, "rewards/accuracies": 1.0, "rewards/chosen": 0.8934411406517029, "rewards/margins": 0.31141364574432373, "rewards/rejected": 0.5820274949073792, "step": 920 }, { "epoch": 0.5, "learning_rate": 9.769450410333643e-08, "logits/chosen": -1.998997688293457, "logits/rejected": -2.246061325073242, "logps/chosen": -1.115278959274292, "logps/rejected": -5.765890598297119, "loss": 0.6226, "rewards/accuracies": 1.0, "rewards/chosen": 0.7557571530342102, "rewards/margins": 0.14650243520736694, "rewards/rejected": 0.6092547178268433, "step": 921 }, { "epoch": 0.5, "learning_rate": 9.768794479058775e-08, "logits/chosen": -2.057034730911255, "logits/rejected": -2.2370553016662598, "logps/chosen": -1.2318789958953857, "logps/rejected": -2.3422787189483643, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 0.8408685922622681, "rewards/margins": -0.007153987884521484, "rewards/rejected": 0.8480225801467896, "step": 922 }, { "epoch": 0.5, "learning_rate": 9.768137638104792e-08, "logits/chosen": -2.0357630252838135, "logits/rejected": -2.2345733642578125, "logps/chosen": -0.956943154335022, "logps/rejected": -0.9687477946281433, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.8403709530830383, "rewards/margins": 0.0005705952644348145, "rewards/rejected": 0.8398003578186035, "step": 923 }, { "epoch": 0.5, "learning_rate": 9.767479887596988e-08, "logits/chosen": -2.081167221069336, "logits/rejected": -2.0790507793426514, "logps/chosen": -1.7303779125213623, "logps/rejected": -2.823363780975342, "loss": 0.5127, "rewards/accuracies": 1.0, "rewards/chosen": 1.062620997428894, "rewards/margins": 0.40087759494781494, "rewards/rejected": 0.6617434024810791, "step": 924 }, { "epoch": 0.5, "learning_rate": 9.76682122766084e-08, "logits/chosen": -2.0617218017578125, "logits/rejected": -2.236018180847168, "logps/chosen": -0.7462990283966064, "logps/rejected": -0.7860300540924072, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.8272472620010376, "rewards/margins": 0.01364070177078247, "rewards/rejected": 0.8136065602302551, "step": 925 }, { "epoch": 0.5, "learning_rate": 9.766161658421985e-08, "logits/chosen": -2.1215872764587402, "logits/rejected": -2.1191017627716064, "logps/chosen": -3.5934159755706787, "logps/rejected": -2.371668815612793, "loss": 0.5054, "rewards/accuracies": 1.0, "rewards/chosen": 1.1300747394561768, "rewards/margins": 0.41899627447128296, "rewards/rejected": 0.7110784649848938, "step": 926 }, { "epoch": 0.5, "learning_rate": 9.765501180006244e-08, "logits/chosen": -2.092473268508911, "logits/rejected": -2.094409465789795, "logps/chosen": -3.2229104042053223, "logps/rejected": -2.7674055099487305, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": 1.0524641275405884, "rewards/margins": 0.461708664894104, "rewards/rejected": 0.5907554626464844, "step": 927 }, { "epoch": 0.5, "learning_rate": 9.764839792539608e-08, "logits/chosen": -2.126511812210083, "logits/rejected": -2.234926223754883, "logps/chosen": -3.2245373725891113, "logps/rejected": -3.350128650665283, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.6019304990768433, "rewards/margins": 0.026477813720703125, "rewards/rejected": 0.5754526853561401, "step": 928 }, { "epoch": 0.5, "learning_rate": 9.764177496148238e-08, "logits/chosen": -2.096621036529541, "logits/rejected": -2.0864429473876953, "logps/chosen": -4.1077351570129395, "logps/rejected": -3.439213275909424, "loss": 0.5357, "rewards/accuracies": 1.0, "rewards/chosen": 0.9410751461982727, "rewards/margins": 0.34431177377700806, "rewards/rejected": 0.5967633724212646, "step": 929 }, { "epoch": 0.5, "learning_rate": 9.763514290958474e-08, "logits/chosen": -2.089453935623169, "logits/rejected": -2.3194119930267334, "logps/chosen": -2.879000425338745, "logps/rejected": -1.9576880931854248, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9400477409362793, "rewards/margins": 0.04994899034500122, "rewards/rejected": 0.8900987505912781, "step": 930 }, { "epoch": 0.5, "learning_rate": 9.762850177096826e-08, "logits/chosen": -2.1370458602905273, "logits/rejected": -2.1055617332458496, "logps/chosen": -18.924304962158203, "logps/rejected": -6.691982746124268, "loss": 0.51, "rewards/accuracies": 1.0, "rewards/chosen": 1.2019150257110596, "rewards/margins": 0.4076555371284485, "rewards/rejected": 0.7942594885826111, "step": 931 }, { "epoch": 0.5, "learning_rate": 9.762185154689978e-08, "logits/chosen": -2.0831058025360107, "logits/rejected": -2.0906667709350586, "logps/chosen": -3.4813637733459473, "logps/rejected": -2.6669044494628906, "loss": 0.5099, "rewards/accuracies": 1.0, "rewards/chosen": 0.9361553192138672, "rewards/margins": 0.40777307748794556, "rewards/rejected": 0.5283822417259216, "step": 932 }, { "epoch": 0.5, "learning_rate": 9.761519223864788e-08, "logits/chosen": -2.0547802448272705, "logits/rejected": -2.076812505722046, "logps/chosen": -31.98133087158203, "logps/rejected": -9.019020080566406, "loss": 0.6226, "rewards/accuracies": 1.0, "rewards/chosen": 0.9619216918945312, "rewards/margins": 0.14652478694915771, "rewards/rejected": 0.8153969049453735, "step": 933 }, { "epoch": 0.5, "learning_rate": 9.760852384748285e-08, "logits/chosen": -2.0637359619140625, "logits/rejected": -2.062314748764038, "logps/chosen": -4.828032970428467, "logps/rejected": -9.964527130126953, "loss": 0.4908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9182506799697876, "rewards/margins": 0.4563644528388977, "rewards/rejected": 0.4618862271308899, "step": 934 }, { "epoch": 0.5, "learning_rate": 9.760184637467677e-08, "logits/chosen": -2.078516960144043, "logits/rejected": -2.2575459480285645, "logps/chosen": -3.6733882427215576, "logps/rejected": -2.4420723915100098, "loss": 0.7107, "rewards/accuracies": 0.0, "rewards/chosen": 0.9028574824333191, "rewards/margins": -0.03472435474395752, "rewards/rejected": 0.9375818371772766, "step": 935 }, { "epoch": 0.5, "learning_rate": 9.759515982150336e-08, "logits/chosen": -2.071240186691284, "logits/rejected": -2.057960271835327, "logps/chosen": -7.114413261413574, "logps/rejected": -5.076229572296143, "loss": 0.3931, "rewards/accuracies": 1.0, "rewards/chosen": 1.2175843715667725, "rewards/margins": 0.7305835485458374, "rewards/rejected": 0.48700079321861267, "step": 936 }, { "epoch": 0.51, "learning_rate": 9.758846418923816e-08, "logits/chosen": -2.0118460655212402, "logits/rejected": -1.9559326171875, "logps/chosen": -37.984527587890625, "logps/rejected": -2.5870609283447266, "loss": 0.3842, "rewards/accuracies": 1.0, "rewards/chosen": 1.3787819147109985, "rewards/margins": 0.7582394480705261, "rewards/rejected": 0.6205424666404724, "step": 937 }, { "epoch": 0.51, "learning_rate": 9.758175947915839e-08, "logits/chosen": -2.093585252761841, "logits/rejected": -2.233511209487915, "logps/chosen": -4.763421535491943, "logps/rejected": -4.392482280731201, "loss": 0.7059, "rewards/accuracies": 0.0, "rewards/chosen": 0.8762456774711609, "rewards/margins": -0.02531224489212036, "rewards/rejected": 0.9015579223632812, "step": 938 }, { "epoch": 0.51, "learning_rate": 9.757504569254303e-08, "logits/chosen": -1.9447656869888306, "logits/rejected": -2.294382333755493, "logps/chosen": -1.761037826538086, "logps/rejected": -1.6921617984771729, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.8458554148674011, "rewards/margins": 0.00570368766784668, "rewards/rejected": 0.8401517271995544, "step": 939 }, { "epoch": 0.51, "learning_rate": 9.756832283067278e-08, "logits/chosen": -1.9731160402297974, "logits/rejected": -1.9756273031234741, "logps/chosen": -7.784238338470459, "logps/rejected": -1.5212798118591309, "loss": 0.2817, "rewards/accuracies": 1.0, "rewards/chosen": 1.6486272811889648, "rewards/margins": 1.1229076385498047, "rewards/rejected": 0.5257195830345154, "step": 940 }, { "epoch": 0.51, "learning_rate": 9.756159089483004e-08, "logits/chosen": -2.003634452819824, "logits/rejected": -2.0121066570281982, "logps/chosen": -2.6988205909729004, "logps/rejected": -2.4235479831695557, "loss": 0.4327, "rewards/accuracies": 1.0, "rewards/chosen": 1.3431395292282104, "rewards/margins": 0.6134490370750427, "rewards/rejected": 0.7296904921531677, "step": 941 }, { "epoch": 0.51, "learning_rate": 9.755484988629901e-08, "logits/chosen": -1.9315764904022217, "logits/rejected": -2.2005865573883057, "logps/chosen": -1.1906394958496094, "logps/rejected": -1.3268144130706787, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.8509364128112793, "rewards/margins": -0.0033397674560546875, "rewards/rejected": 0.854276180267334, "step": 942 }, { "epoch": 0.51, "learning_rate": 9.754809980636555e-08, "logits/chosen": -2.1430301666259766, "logits/rejected": -2.1469485759735107, "logps/chosen": -4.06695556640625, "logps/rejected": -3.522700786590576, "loss": 0.4824, "rewards/accuracies": 1.0, "rewards/chosen": 1.0691970586776733, "rewards/margins": 0.47807687520980835, "rewards/rejected": 0.591120183467865, "step": 943 }, { "epoch": 0.51, "learning_rate": 9.754134065631731e-08, "logits/chosen": -1.9700226783752441, "logits/rejected": -1.9499695301055908, "logps/chosen": -20.5800724029541, "logps/rejected": -2.2520604133605957, "loss": 0.401, "rewards/accuracies": 1.0, "rewards/chosen": 1.4168996810913086, "rewards/margins": 0.7067172527313232, "rewards/rejected": 0.7101824283599854, "step": 944 }, { "epoch": 0.51, "learning_rate": 9.753457243744364e-08, "logits/chosen": -2.096733570098877, "logits/rejected": -2.0997438430786133, "logps/chosen": -4.3796844482421875, "logps/rejected": -0.681847870349884, "loss": 0.6041, "rewards/accuracies": 1.0, "rewards/chosen": 1.1897220611572266, "rewards/margins": 0.1867821216583252, "rewards/rejected": 1.0029399394989014, "step": 945 }, { "epoch": 0.51, "learning_rate": 9.75277951510356e-08, "logits/chosen": -2.107194185256958, "logits/rejected": -2.235403060913086, "logps/chosen": -2.8371975421905518, "logps/rejected": -1.7434442043304443, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 0.9804770350456238, "rewards/margins": 0.05174660682678223, "rewards/rejected": 0.9287304282188416, "step": 946 }, { "epoch": 0.51, "learning_rate": 9.752100879838602e-08, "logits/chosen": -2.1061744689941406, "logits/rejected": -2.2437338829040527, "logps/chosen": -0.47504669427871704, "logps/rejected": -0.5334230661392212, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.7972463369369507, "rewards/margins": 0.011256158351898193, "rewards/rejected": 0.7859901785850525, "step": 947 }, { "epoch": 0.51, "learning_rate": 9.751421338078945e-08, "logits/chosen": -2.138944625854492, "logits/rejected": -2.1363396644592285, "logps/chosen": -3.0538246631622314, "logps/rejected": -5.406630516052246, "loss": 0.6347, "rewards/accuracies": 1.0, "rewards/chosen": 0.8866614699363708, "rewards/margins": 0.1205105185508728, "rewards/rejected": 0.766150951385498, "step": 948 }, { "epoch": 0.51, "learning_rate": 9.750740889954215e-08, "logits/chosen": -2.0769758224487305, "logits/rejected": -2.250910520553589, "logps/chosen": -0.8440844416618347, "logps/rejected": -0.9513024091720581, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.8430793881416321, "rewards/margins": 0.022211551666259766, "rewards/rejected": 0.8208678364753723, "step": 949 }, { "epoch": 0.51, "learning_rate": 9.75005953559421e-08, "logits/chosen": -2.0084407329559326, "logits/rejected": -2.234206199645996, "logps/chosen": -1.3291703462600708, "logps/rejected": -5.559142589569092, "loss": 0.5303, "rewards/accuracies": 1.0, "rewards/chosen": 1.0595892667770386, "rewards/margins": 0.35751694440841675, "rewards/rejected": 0.7020723223686218, "step": 950 }, { "epoch": 0.51, "learning_rate": 9.749377275128905e-08, "logits/chosen": -2.096778631210327, "logits/rejected": -2.2404863834381104, "logps/chosen": -1.1827952861785889, "logps/rejected": -1.2617219686508179, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.8642493486404419, "rewards/margins": 0.026068508625030518, "rewards/rejected": 0.8381808400154114, "step": 951 }, { "epoch": 0.51, "learning_rate": 9.748694108688446e-08, "logits/chosen": -2.1218888759613037, "logits/rejected": -2.1216187477111816, "logps/chosen": -3.5501630306243896, "logps/rejected": -2.748181104660034, "loss": 0.3508, "rewards/accuracies": 1.0, "rewards/chosen": 1.4314281940460205, "rewards/margins": 0.8670035004615784, "rewards/rejected": 0.5644246935844421, "step": 952 }, { "epoch": 0.51, "learning_rate": 9.748010036403151e-08, "logits/chosen": -1.9477365016937256, "logits/rejected": -1.9466911554336548, "logps/chosen": -0.7651207447052002, "logps/rejected": -4.778138637542725, "loss": 0.5457, "rewards/accuracies": 1.0, "rewards/chosen": 0.9217633605003357, "rewards/margins": 0.3205639719963074, "rewards/rejected": 0.6011993885040283, "step": 953 }, { "epoch": 0.51, "learning_rate": 9.74732505840351e-08, "logits/chosen": -2.1370034217834473, "logits/rejected": -2.3223843574523926, "logps/chosen": -1.8169406652450562, "logps/rejected": -1.777172327041626, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9774591326713562, "rewards/margins": 0.01724565029144287, "rewards/rejected": 0.9602134823799133, "step": 954 }, { "epoch": 0.52, "learning_rate": 9.74663917482019e-08, "logits/chosen": -2.081254243850708, "logits/rejected": -2.2458372116088867, "logps/chosen": -2.2896032333374023, "logps/rejected": -2.359961986541748, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.5981122255325317, "rewards/margins": 0.019565045833587646, "rewards/rejected": 0.5785471796989441, "step": 955 }, { "epoch": 0.52, "learning_rate": 9.745952385784024e-08, "logits/chosen": -2.1424434185028076, "logits/rejected": -2.173251152038574, "logps/chosen": -3.5111258029937744, "logps/rejected": -14.034854888916016, "loss": 0.4088, "rewards/accuracies": 1.0, "rewards/chosen": 1.2099417448043823, "rewards/margins": 0.6832800507545471, "rewards/rejected": 0.5266616940498352, "step": 956 }, { "epoch": 0.52, "learning_rate": 9.745264691426025e-08, "logits/chosen": -1.9906624555587769, "logits/rejected": -1.9928855895996094, "logps/chosen": -3.028193712234497, "logps/rejected": -6.046070098876953, "loss": 0.5187, "rewards/accuracies": 1.0, "rewards/chosen": 0.9210468530654907, "rewards/margins": 0.3859073519706726, "rewards/rejected": 0.5351395010948181, "step": 957 }, { "epoch": 0.52, "learning_rate": 9.744576091877374e-08, "logits/chosen": -2.0599541664123535, "logits/rejected": -2.0693395137786865, "logps/chosen": -3.799454689025879, "logps/rejected": -11.514328002929688, "loss": 0.6606, "rewards/accuracies": 1.0, "rewards/chosen": 0.837713897228241, "rewards/margins": 0.06614255905151367, "rewards/rejected": 0.7715713381767273, "step": 958 }, { "epoch": 0.52, "learning_rate": 9.743886587269426e-08, "logits/chosen": -2.0735318660736084, "logits/rejected": -2.260887861251831, "logps/chosen": -3.914376735687256, "logps/rejected": -14.977937698364258, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9393644332885742, "rewards/margins": 0.051280200481414795, "rewards/rejected": 0.8880842328071594, "step": 959 }, { "epoch": 0.52, "learning_rate": 9.743196177733708e-08, "logits/chosen": -2.049013614654541, "logits/rejected": -2.2523298263549805, "logps/chosen": -12.936803817749023, "logps/rejected": -6.80341100692749, "loss": 0.7833, "rewards/accuracies": 0.0, "rewards/chosen": 0.743852436542511, "rewards/margins": -0.17286431789398193, "rewards/rejected": 0.9167167544364929, "step": 960 }, { "epoch": 0.52, "learning_rate": 9.742504863401921e-08, "logits/chosen": -2.0572667121887207, "logits/rejected": -2.2656667232513428, "logps/chosen": -1.4726521968841553, "logps/rejected": -1.5545680522918701, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.8616930246353149, "rewards/margins": 0.03146249055862427, "rewards/rejected": 0.8302305340766907, "step": 961 }, { "epoch": 0.52, "learning_rate": 9.741812644405938e-08, "logits/chosen": -2.0582847595214844, "logits/rejected": -2.058906316757202, "logps/chosen": -4.397491455078125, "logps/rejected": -3.6038646697998047, "loss": 0.5803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8689908981323242, "rewards/margins": 0.24009376764297485, "rewards/rejected": 0.6288971304893494, "step": 962 }, { "epoch": 0.52, "learning_rate": 9.741119520877803e-08, "logits/chosen": -2.1049869060516357, "logits/rejected": -2.105862617492676, "logps/chosen": -0.6093935966491699, "logps/rejected": -3.2064528465270996, "loss": 0.5586, "rewards/accuracies": 1.0, "rewards/chosen": 0.9498659372329712, "rewards/margins": 0.2899898290634155, "rewards/rejected": 0.6598761081695557, "step": 963 }, { "epoch": 0.52, "learning_rate": 9.740425492949736e-08, "logits/chosen": -2.048743724822998, "logits/rejected": -2.260253667831421, "logps/chosen": -2.5791964530944824, "logps/rejected": -2.560448169708252, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.8465628027915955, "rewards/margins": 0.014224827289581299, "rewards/rejected": 0.8323379755020142, "step": 964 }, { "epoch": 0.52, "learning_rate": 9.739730560754125e-08, "logits/chosen": -2.075178384780884, "logits/rejected": -2.2607359886169434, "logps/chosen": -32.66387939453125, "logps/rejected": -32.75009536743164, "loss": 0.7698, "rewards/accuracies": 0.0, "rewards/chosen": 0.14618530869483948, "rewards/margins": -0.14780348539352417, "rewards/rejected": 0.29398879408836365, "step": 965 }, { "epoch": 0.52, "learning_rate": 9.739034724423533e-08, "logits/chosen": -2.1318206787109375, "logits/rejected": -2.118159294128418, "logps/chosen": -10.402931213378906, "logps/rejected": -2.4567763805389404, "loss": 0.5776, "rewards/accuracies": 1.0, "rewards/chosen": 1.0172837972640991, "rewards/margins": 0.2461267113685608, "rewards/rejected": 0.7711570858955383, "step": 966 }, { "epoch": 0.52, "learning_rate": 9.738337984090697e-08, "logits/chosen": -2.055633783340454, "logits/rejected": -2.2839887142181396, "logps/chosen": -3.1063790321350098, "logps/rejected": -8.176647186279297, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": 0.723142683506012, "rewards/margins": 0.1989811658859253, "rewards/rejected": 0.5241615176200867, "step": 967 }, { "epoch": 0.52, "learning_rate": 9.737640339888525e-08, "logits/chosen": -2.108154535293579, "logits/rejected": -2.1061646938323975, "logps/chosen": -6.385035991668701, "logps/rejected": -5.478148460388184, "loss": 0.3379, "rewards/accuracies": 1.0, "rewards/chosen": 1.3037060499191284, "rewards/margins": 0.9111753702163696, "rewards/rejected": 0.3925306499004364, "step": 968 }, { "epoch": 0.52, "learning_rate": 9.736941791950096e-08, "logits/chosen": -2.2297847270965576, "logits/rejected": -2.315162420272827, "logps/chosen": -1.6446794271469116, "logps/rejected": -1.6212263107299805, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": 0.9906787276268005, "rewards/margins": -0.008857786655426025, "rewards/rejected": 0.9995365142822266, "step": 969 }, { "epoch": 0.52, "learning_rate": 9.736242340408663e-08, "logits/chosen": -2.1378087997436523, "logits/rejected": -2.2717690467834473, "logps/chosen": -0.6324343681335449, "logps/rejected": -0.6814890503883362, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.9058456420898438, "rewards/margins": 0.012666106224060059, "rewards/rejected": 0.8931795358657837, "step": 970 }, { "epoch": 0.52, "learning_rate": 9.735541985397651e-08, "logits/chosen": -2.0236942768096924, "logits/rejected": -2.281653642654419, "logps/chosen": -1.0858207941055298, "logps/rejected": -1.036466121673584, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.8031660318374634, "rewards/margins": 0.009337365627288818, "rewards/rejected": 0.7938286662101746, "step": 971 }, { "epoch": 0.52, "learning_rate": 9.734840727050659e-08, "logits/chosen": -2.0273990631103516, "logits/rejected": -2.262894630432129, "logps/chosen": -2.253662109375, "logps/rejected": -2.5074076652526855, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.9749213457107544, "rewards/margins": 0.02427572011947632, "rewards/rejected": 0.9506456255912781, "step": 972 }, { "epoch": 0.52, "learning_rate": 9.734138565501454e-08, "logits/chosen": -2.055157423019409, "logits/rejected": -2.105215311050415, "logps/chosen": -7.616191387176514, "logps/rejected": -15.466064453125, "loss": 0.3029, "rewards/accuracies": 1.0, "rewards/chosen": 1.2592829465866089, "rewards/margins": 1.0390772819519043, "rewards/rejected": 0.22020569443702698, "step": 973 }, { "epoch": 0.53, "learning_rate": 9.73343550088398e-08, "logits/chosen": -2.142354965209961, "logits/rejected": -2.1442339420318604, "logps/chosen": -4.920438289642334, "logps/rejected": -9.154436111450195, "loss": 0.5036, "rewards/accuracies": 1.0, "rewards/chosen": 1.1924227476119995, "rewards/margins": 0.42372214794158936, "rewards/rejected": 0.7687005996704102, "step": 974 }, { "epoch": 0.53, "learning_rate": 9.732731533332349e-08, "logits/chosen": -2.082409143447876, "logits/rejected": -2.08685040473938, "logps/chosen": -11.43187141418457, "logps/rejected": -1.9543638229370117, "loss": 0.4479, "rewards/accuracies": 1.0, "rewards/chosen": 1.2559168338775635, "rewards/margins": 0.5709708333015442, "rewards/rejected": 0.6849460005760193, "step": 975 }, { "epoch": 0.53, "learning_rate": 9.732026662980849e-08, "logits/chosen": -2.0397956371307373, "logits/rejected": -2.2291858196258545, "logps/chosen": -6.700671672821045, "logps/rejected": -1.070335865020752, "loss": 0.7656, "rewards/accuracies": 0.0, "rewards/chosen": 0.7735952138900757, "rewards/margins": -0.13997673988342285, "rewards/rejected": 0.9135719537734985, "step": 976 }, { "epoch": 0.53, "learning_rate": 9.731320889963939e-08, "logits/chosen": -2.0205280780792236, "logits/rejected": -2.031432867050171, "logps/chosen": -3.491196393966675, "logps/rejected": -1.7854421138763428, "loss": 0.5107, "rewards/accuracies": 1.0, "rewards/chosen": 1.0550217628479004, "rewards/margins": 0.4057340621948242, "rewards/rejected": 0.6492877006530762, "step": 977 }, { "epoch": 0.53, "learning_rate": 9.730614214416249e-08, "logits/chosen": -2.048166513442993, "logits/rejected": -2.0449206829071045, "logps/chosen": -10.45521354675293, "logps/rejected": -6.325465679168701, "loss": 0.4343, "rewards/accuracies": 1.0, "rewards/chosen": 1.2890771627426147, "rewards/margins": 0.6089152693748474, "rewards/rejected": 0.6801618933677673, "step": 978 }, { "epoch": 0.53, "learning_rate": 9.729906636472583e-08, "logits/chosen": -2.058786153793335, "logits/rejected": -2.259305238723755, "logps/chosen": -0.4147983491420746, "logps/rejected": -0.3892306089401245, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.6463221907615662, "rewards/margins": 0.018553078174591064, "rewards/rejected": 0.6277691125869751, "step": 979 }, { "epoch": 0.53, "learning_rate": 9.729198156267915e-08, "logits/chosen": -2.0415520668029785, "logits/rejected": -2.2632486820220947, "logps/chosen": -1.9109959602355957, "logps/rejected": -2.028832197189331, "loss": 0.6713, "rewards/accuracies": 1.0, "rewards/chosen": 0.91558837890625, "rewards/margins": 0.04422217607498169, "rewards/rejected": 0.8713662028312683, "step": 980 }, { "epoch": 0.53, "learning_rate": 9.728488773937394e-08, "logits/chosen": -2.056800127029419, "logits/rejected": -2.0349979400634766, "logps/chosen": -8.332571029663086, "logps/rejected": -3.5641274452209473, "loss": 0.4189, "rewards/accuracies": 1.0, "rewards/chosen": 1.2817872762680054, "rewards/margins": 0.6533133387565613, "rewards/rejected": 0.6284739375114441, "step": 981 }, { "epoch": 0.53, "learning_rate": 9.727778489616338e-08, "logits/chosen": -2.058838367462158, "logits/rejected": -2.0597074031829834, "logps/chosen": -4.223777770996094, "logps/rejected": -2.7340941429138184, "loss": 0.398, "rewards/accuracies": 1.0, "rewards/chosen": 1.6136085987091064, "rewards/margins": 0.7158266305923462, "rewards/rejected": 0.8977819681167603, "step": 982 }, { "epoch": 0.53, "learning_rate": 9.727067303440238e-08, "logits/chosen": -2.045163154602051, "logits/rejected": -2.045414686203003, "logps/chosen": -6.180862903594971, "logps/rejected": -2.5747156143188477, "loss": 0.3176, "rewards/accuracies": 1.0, "rewards/chosen": 1.5512140989303589, "rewards/margins": 0.9839776754379272, "rewards/rejected": 0.5672364234924316, "step": 983 }, { "epoch": 0.53, "learning_rate": 9.726355215544757e-08, "logits/chosen": -1.9716322422027588, "logits/rejected": -1.964288353919983, "logps/chosen": -12.868237495422363, "logps/rejected": -11.641912460327148, "loss": 0.5904, "rewards/accuracies": 1.0, "rewards/chosen": 1.1631766557693481, "rewards/margins": 0.21724671125411987, "rewards/rejected": 0.9459299445152283, "step": 984 }, { "epoch": 0.53, "learning_rate": 9.725642226065733e-08, "logits/chosen": -2.116010904312134, "logits/rejected": -2.1144630908966064, "logps/chosen": -10.246057510375977, "logps/rejected": -1.4603040218353271, "loss": 0.5654, "rewards/accuracies": 1.0, "rewards/chosen": 1.2126268148422241, "rewards/margins": 0.2741304039955139, "rewards/rejected": 0.9384964108467102, "step": 985 }, { "epoch": 0.53, "learning_rate": 9.724928335139173e-08, "logits/chosen": -2.087568998336792, "logits/rejected": -2.1139941215515137, "logps/chosen": -9.052865028381348, "logps/rejected": -8.513121604919434, "loss": 0.5412, "rewards/accuracies": 1.0, "rewards/chosen": 1.2923682928085327, "rewards/margins": 0.33109670877456665, "rewards/rejected": 0.9612715840339661, "step": 986 }, { "epoch": 0.53, "learning_rate": 9.724213542901254e-08, "logits/chosen": -1.9615670442581177, "logits/rejected": -1.9622061252593994, "logps/chosen": -0.43824896216392517, "logps/rejected": -6.299709320068359, "loss": 0.5595, "rewards/accuracies": 1.0, "rewards/chosen": 0.7980244159698486, "rewards/margins": 0.2879346013069153, "rewards/rejected": 0.5100898146629333, "step": 987 }, { "epoch": 0.53, "learning_rate": 9.72349784948833e-08, "logits/chosen": -2.1421051025390625, "logits/rejected": -2.2398650646209717, "logps/chosen": -4.152554988861084, "logps/rejected": -4.279883861541748, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.9767860770225525, "rewards/margins": 0.03278219699859619, "rewards/rejected": 0.9440038800239563, "step": 988 }, { "epoch": 0.53, "learning_rate": 9.722781255036922e-08, "logits/chosen": -2.0330123901367188, "logits/rejected": -2.039158344268799, "logps/chosen": -5.694128036499023, "logps/rejected": -8.335450172424316, "loss": 0.5302, "rewards/accuracies": 1.0, "rewards/chosen": 1.001075267791748, "rewards/margins": 0.3576619029045105, "rewards/rejected": 0.6434133648872375, "step": 989 }, { "epoch": 0.53, "learning_rate": 9.722063759683727e-08, "logits/chosen": -2.1375463008880615, "logits/rejected": -2.133941411972046, "logps/chosen": -5.341357231140137, "logps/rejected": -6.986926078796387, "loss": 0.4604, "rewards/accuracies": 1.0, "rewards/chosen": 1.3032996654510498, "rewards/margins": 0.5367016196250916, "rewards/rejected": 0.7665980458259583, "step": 990 }, { "epoch": 0.53, "learning_rate": 9.721345363565611e-08, "logits/chosen": -1.9916929006576538, "logits/rejected": -2.0012357234954834, "logps/chosen": -2.423828363418579, "logps/rejected": -2.0887608528137207, "loss": 0.4959, "rewards/accuracies": 1.0, "rewards/chosen": 1.1622748374938965, "rewards/margins": 0.4433257579803467, "rewards/rejected": 0.7189490795135498, "step": 991 }, { "epoch": 0.54, "learning_rate": 9.720626066819615e-08, "logits/chosen": -2.0993051528930664, "logits/rejected": -2.2905492782592773, "logps/chosen": -10.46972942352295, "logps/rejected": -8.609619140625, "loss": 0.7684, "rewards/accuracies": 0.0, "rewards/chosen": 0.7108694911003113, "rewards/margins": -0.14518463611602783, "rewards/rejected": 0.8560541272163391, "step": 992 }, { "epoch": 0.54, "learning_rate": 9.719905869582946e-08, "logits/chosen": -2.116579294204712, "logits/rejected": -2.228116750717163, "logps/chosen": -7.774016857147217, "logps/rejected": -7.077712059020996, "loss": 0.6513, "rewards/accuracies": 1.0, "rewards/chosen": 0.8392738699913025, "rewards/margins": 0.08557760715484619, "rewards/rejected": 0.7536962628364563, "step": 993 }, { "epoch": 0.54, "learning_rate": 9.719184771992987e-08, "logits/chosen": -2.000149726867676, "logits/rejected": -2.3002469539642334, "logps/chosen": -3.031586170196533, "logps/rejected": -3.2437655925750732, "loss": 0.6963, "rewards/accuracies": 0.0, "rewards/chosen": 0.9634460806846619, "rewards/margins": -0.006366908550262451, "rewards/rejected": 0.9698129892349243, "step": 994 }, { "epoch": 0.54, "learning_rate": 9.718462774187295e-08, "logits/chosen": -2.0010695457458496, "logits/rejected": -2.0107901096343994, "logps/chosen": -3.2009215354919434, "logps/rejected": -1.7298740148544312, "loss": 0.4365, "rewards/accuracies": 1.0, "rewards/chosen": 1.2798858880996704, "rewards/margins": 0.6026763319969177, "rewards/rejected": 0.6772095561027527, "step": 995 }, { "epoch": 0.54, "learning_rate": 9.717739876303594e-08, "logits/chosen": -2.0811476707458496, "logits/rejected": -2.058858871459961, "logps/chosen": -14.579436302185059, "logps/rejected": -2.7944796085357666, "loss": 0.4278, "rewards/accuracies": 1.0, "rewards/chosen": 1.25800359249115, "rewards/margins": 0.6275206208229065, "rewards/rejected": 0.6304829716682434, "step": 996 }, { "epoch": 0.54, "learning_rate": 9.717016078479781e-08, "logits/chosen": -2.1087849140167236, "logits/rejected": -2.1070098876953125, "logps/chosen": -1.9120252132415771, "logps/rejected": -4.503766059875488, "loss": 0.6336, "rewards/accuracies": 1.0, "rewards/chosen": 0.958484947681427, "rewards/margins": 0.1229482889175415, "rewards/rejected": 0.8355366587638855, "step": 997 }, { "epoch": 0.54, "learning_rate": 9.716291380853927e-08, "logits/chosen": -2.0625784397125244, "logits/rejected": -2.2758963108062744, "logps/chosen": -3.8243138790130615, "logps/rejected": -3.820145845413208, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.7183344960212708, "rewards/margins": -0.010754764080047607, "rewards/rejected": 0.7290892601013184, "step": 998 }, { "epoch": 0.54, "learning_rate": 9.71556578356427e-08, "logits/chosen": -2.172163963317871, "logits/rejected": -2.156271457672119, "logps/chosen": -3.2053799629211426, "logps/rejected": -7.1982035636901855, "loss": 0.454, "rewards/accuracies": 1.0, "rewards/chosen": 1.2057838439941406, "rewards/margins": 0.5540573596954346, "rewards/rejected": 0.651726484298706, "step": 999 }, { "epoch": 0.54, "learning_rate": 9.714839286749226e-08, "logits/chosen": -1.9743486642837524, "logits/rejected": -1.9789971113204956, "logps/chosen": -4.2488861083984375, "logps/rejected": -2.60947322845459, "loss": 0.488, "rewards/accuracies": 1.0, "rewards/chosen": 1.1010221242904663, "rewards/margins": 0.46342819929122925, "rewards/rejected": 0.6375939249992371, "step": 1000 }, { "epoch": 0.54, "learning_rate": 9.714111890547376e-08, "logits/chosen": -1.9274214506149292, "logits/rejected": -1.9363256692886353, "logps/chosen": -1.842676043510437, "logps/rejected": -2.435797691345215, "loss": 0.513, "rewards/accuracies": 1.0, "rewards/chosen": 1.0204206705093384, "rewards/margins": 0.3999948501586914, "rewards/rejected": 0.620425820350647, "step": 1001 }, { "epoch": 0.54, "learning_rate": 9.713383595097477e-08, "logits/chosen": -2.1113040447235107, "logits/rejected": -2.265570640563965, "logps/chosen": -2.970564126968384, "logps/rejected": -2.8070437908172607, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.6061927080154419, "rewards/margins": 0.0052890777587890625, "rewards/rejected": 0.6009036302566528, "step": 1002 }, { "epoch": 0.54, "learning_rate": 9.712654400538456e-08, "logits/chosen": -2.0197396278381348, "logits/rejected": -2.0224733352661133, "logps/chosen": -3.8626816272735596, "logps/rejected": -0.7570096254348755, "loss": 0.5596, "rewards/accuracies": 1.0, "rewards/chosen": 1.1374577283859253, "rewards/margins": 0.28760915994644165, "rewards/rejected": 0.8498485684394836, "step": 1003 }, { "epoch": 0.54, "learning_rate": 9.711924307009413e-08, "logits/chosen": -2.0870554447174072, "logits/rejected": -2.3252334594726562, "logps/chosen": -0.46279028058052063, "logps/rejected": -0.5006131529808044, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 1.0077792406082153, "rewards/margins": 0.013739407062530518, "rewards/rejected": 0.9940398335456848, "step": 1004 }, { "epoch": 0.54, "learning_rate": 9.711193314649616e-08, "logits/chosen": -1.9317989349365234, "logits/rejected": -2.2475452423095703, "logps/chosen": -0.7763012647628784, "logps/rejected": -0.8044531345367432, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 0.8073943257331848, "rewards/margins": 0.03779047727584839, "rewards/rejected": 0.7696038484573364, "step": 1005 }, { "epoch": 0.54, "learning_rate": 9.710461423598508e-08, "logits/chosen": -2.043570041656494, "logits/rejected": -2.044024705886841, "logps/chosen": -1.1825343370437622, "logps/rejected": -2.101205348968506, "loss": 0.5367, "rewards/accuracies": 1.0, "rewards/chosen": 1.0161088705062866, "rewards/margins": 0.34188634157180786, "rewards/rejected": 0.6742225289344788, "step": 1006 }, { "epoch": 0.54, "learning_rate": 9.709728633995702e-08, "logits/chosen": -2.01261305809021, "logits/rejected": -2.252711296081543, "logps/chosen": -0.8710815906524658, "logps/rejected": -0.902773380279541, "loss": 0.661, "rewards/accuracies": 1.0, "rewards/chosen": 0.9404675364494324, "rewards/margins": 0.06536281108856201, "rewards/rejected": 0.8751047253608704, "step": 1007 }, { "epoch": 0.54, "learning_rate": 9.708994945980982e-08, "logits/chosen": -2.073986530303955, "logits/rejected": -2.2428479194641113, "logps/chosen": -0.7876303195953369, "logps/rejected": -3.6836915016174316, "loss": 0.6104, "rewards/accuracies": 1.0, "rewards/chosen": 0.946736752986908, "rewards/margins": 0.1730024814605713, "rewards/rejected": 0.7737342715263367, "step": 1008 }, { "epoch": 0.54, "learning_rate": 9.708260359694303e-08, "logits/chosen": -2.0119025707244873, "logits/rejected": -2.005270004272461, "logps/chosen": -16.519439697265625, "logps/rejected": -6.245830059051514, "loss": 0.4194, "rewards/accuracies": 1.0, "rewards/chosen": 1.3592909574508667, "rewards/margins": 0.651878297328949, "rewards/rejected": 0.7074126601219177, "step": 1009 }, { "epoch": 0.54, "learning_rate": 9.707524875275794e-08, "logits/chosen": -2.021423816680908, "logits/rejected": -2.309858798980713, "logps/chosen": -0.8609433770179749, "logps/rejected": -0.8059597015380859, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.8231417536735535, "rewards/margins": -0.0028905272483825684, "rewards/rejected": 0.826032280921936, "step": 1010 }, { "epoch": 0.55, "learning_rate": 9.706788492865751e-08, "logits/chosen": -2.083853244781494, "logits/rejected": -2.2834553718566895, "logps/chosen": -10.927928924560547, "logps/rejected": -0.5585795044898987, "loss": 0.8138, "rewards/accuracies": 0.0, "rewards/chosen": 0.6929529309272766, "rewards/margins": -0.22828465700149536, "rewards/rejected": 0.921237587928772, "step": 1011 }, { "epoch": 0.55, "learning_rate": 9.706051212604648e-08, "logits/chosen": -1.9304028749465942, "logits/rejected": -1.9297199249267578, "logps/chosen": -1.6605737209320068, "logps/rejected": -1.5661072731018066, "loss": 0.7052, "rewards/accuracies": 0.0, "rewards/chosen": 0.9201754927635193, "rewards/margins": -0.0239332914352417, "rewards/rejected": 0.944108784198761, "step": 1012 }, { "epoch": 0.55, "learning_rate": 9.705313034633123e-08, "logits/chosen": -2.268979787826538, "logits/rejected": -2.1565418243408203, "logps/chosen": -42.1068229675293, "logps/rejected": -3.714320659637451, "loss": 0.5158, "rewards/accuracies": 1.0, "rewards/chosen": 1.293008804321289, "rewards/margins": 0.3930557370185852, "rewards/rejected": 0.8999530673027039, "step": 1013 }, { "epoch": 0.55, "learning_rate": 9.704573959091988e-08, "logits/chosen": -2.0478515625, "logits/rejected": -2.2096965312957764, "logps/chosen": -1.0953052043914795, "logps/rejected": -1.1280854940414429, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.8811832666397095, "rewards/margins": -0.004952609539031982, "rewards/rejected": 0.8861358761787415, "step": 1014 }, { "epoch": 0.55, "learning_rate": 9.70383398612223e-08, "logits/chosen": -2.1264898777008057, "logits/rejected": -2.292572021484375, "logps/chosen": -34.00345993041992, "logps/rejected": -8.159066200256348, "loss": 1.1054, "rewards/accuracies": 0.0, "rewards/chosen": 0.21336936950683594, "rewards/margins": -0.7032418251037598, "rewards/rejected": 0.9166111946105957, "step": 1015 }, { "epoch": 0.55, "learning_rate": 9.703093115864999e-08, "logits/chosen": -2.1025712490081787, "logits/rejected": -2.1069648265838623, "logps/chosen": -2.0660665035247803, "logps/rejected": -3.77579665184021, "loss": 0.4412, "rewards/accuracies": 1.0, "rewards/chosen": 1.1618446111679077, "rewards/margins": 0.5895355939865112, "rewards/rejected": 0.5723090171813965, "step": 1016 }, { "epoch": 0.55, "learning_rate": 9.702351348461623e-08, "logits/chosen": -2.087648868560791, "logits/rejected": -2.0283901691436768, "logps/chosen": -15.28567123413086, "logps/rejected": -19.074703216552734, "loss": 0.3429, "rewards/accuracies": 1.0, "rewards/chosen": 1.439019799232483, "rewards/margins": 0.8940185904502869, "rewards/rejected": 0.545001208782196, "step": 1017 }, { "epoch": 0.55, "learning_rate": 9.701608684053599e-08, "logits/chosen": -2.084407329559326, "logits/rejected": -2.292717695236206, "logps/chosen": -0.8155093193054199, "logps/rejected": -0.8566999435424805, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9051069617271423, "rewards/margins": 0.005539894104003906, "rewards/rejected": 0.8995670676231384, "step": 1018 }, { "epoch": 0.55, "learning_rate": 9.700865122782597e-08, "logits/chosen": -2.1483805179595947, "logits/rejected": -2.118917942047119, "logps/chosen": -25.06488609313965, "logps/rejected": -4.029162883758545, "loss": 0.3681, "rewards/accuracies": 1.0, "rewards/chosen": 1.3770735263824463, "rewards/margins": 0.8097599148750305, "rewards/rejected": 0.5673136115074158, "step": 1019 }, { "epoch": 0.55, "learning_rate": 9.700120664790454e-08, "logits/chosen": -2.038398504257202, "logits/rejected": -2.0424349308013916, "logps/chosen": -3.1913180351257324, "logps/rejected": -3.699183225631714, "loss": 0.3656, "rewards/accuracies": 1.0, "rewards/chosen": 1.3668012619018555, "rewards/margins": 0.8179931640625, "rewards/rejected": 0.5488080978393555, "step": 1020 }, { "epoch": 0.55, "learning_rate": 9.69937531021918e-08, "logits/chosen": -2.1108033657073975, "logits/rejected": -2.2592408657073975, "logps/chosen": -9.649446487426758, "logps/rejected": -9.847236633300781, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.812154233455658, "rewards/margins": 0.009389162063598633, "rewards/rejected": 0.8027650713920593, "step": 1021 }, { "epoch": 0.55, "learning_rate": 9.698629059210959e-08, "logits/chosen": -2.17326283454895, "logits/rejected": -2.1737191677093506, "logps/chosen": -1.02232825756073, "logps/rejected": -2.0415103435516357, "loss": 0.5651, "rewards/accuracies": 1.0, "rewards/chosen": 1.040291428565979, "rewards/margins": 0.2749220132827759, "rewards/rejected": 0.7653694152832031, "step": 1022 }, { "epoch": 0.55, "learning_rate": 9.69788191190814e-08, "logits/chosen": -2.050492763519287, "logits/rejected": -2.2118115425109863, "logps/chosen": -0.9081387519836426, "logps/rejected": -0.9353636503219604, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 0.7627133131027222, "rewards/margins": 0.02353358268737793, "rewards/rejected": 0.7391797304153442, "step": 1023 }, { "epoch": 0.55, "learning_rate": 9.697133868453249e-08, "logits/chosen": -2.0598931312561035, "logits/rejected": -2.173251152038574, "logps/chosen": -0.9075214266777039, "logps/rejected": -0.883769690990448, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.8095809817314148, "rewards/margins": 0.027748048305511475, "rewards/rejected": 0.7818329334259033, "step": 1024 }, { "epoch": 0.55, "learning_rate": 9.696384928988978e-08, "logits/chosen": -2.0470547676086426, "logits/rejected": -2.2165608406066895, "logps/chosen": -1.7766196727752686, "logps/rejected": -1.7206122875213623, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 0.8931631445884705, "rewards/margins": -0.0217629075050354, "rewards/rejected": 0.9149260520935059, "step": 1025 }, { "epoch": 0.55, "learning_rate": 9.695635093658193e-08, "logits/chosen": -2.1038341522216797, "logits/rejected": -2.1945416927337646, "logps/chosen": -1.2725424766540527, "logps/rejected": -1.3594197034835815, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.7477303147315979, "rewards/margins": -0.005827963352203369, "rewards/rejected": 0.7535582780838013, "step": 1026 }, { "epoch": 0.55, "learning_rate": 9.694884362603931e-08, "logits/chosen": -2.026143789291382, "logits/rejected": -2.030310869216919, "logps/chosen": -3.1398396492004395, "logps/rejected": -2.579376220703125, "loss": 0.4105, "rewards/accuracies": 1.0, "rewards/chosen": 1.1716059446334839, "rewards/margins": 0.6782134771347046, "rewards/rejected": 0.4933924674987793, "step": 1027 }, { "epoch": 0.55, "learning_rate": 9.694132735969399e-08, "logits/chosen": -2.026240110397339, "logits/rejected": -2.234355926513672, "logps/chosen": -1.156188726425171, "logps/rejected": -1.2297662496566772, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.9361404776573181, "rewards/margins": 0.02635061740875244, "rewards/rejected": 0.9097898602485657, "step": 1028 }, { "epoch": 0.56, "learning_rate": 9.693380213897972e-08, "logits/chosen": -2.021801233291626, "logits/rejected": -2.0138328075408936, "logps/chosen": -3.7468841075897217, "logps/rejected": -4.784451007843018, "loss": 0.573, "rewards/accuracies": 1.0, "rewards/chosen": 0.7702677845954895, "rewards/margins": 0.2566460967063904, "rewards/rejected": 0.5136216878890991, "step": 1029 }, { "epoch": 0.56, "learning_rate": 9.692626796533201e-08, "logits/chosen": -1.960627794265747, "logits/rejected": -2.2715859413146973, "logps/chosen": -18.54566764831543, "logps/rejected": -1.2172727584838867, "loss": 0.6452, "rewards/accuracies": 1.0, "rewards/chosen": 0.8576936721801758, "rewards/margins": 0.09833574295043945, "rewards/rejected": 0.7593579292297363, "step": 1030 }, { "epoch": 0.56, "learning_rate": 9.691872484018806e-08, "logits/chosen": -2.095104932785034, "logits/rejected": -2.2525107860565186, "logps/chosen": -0.6813498735427856, "logps/rejected": -0.7078900933265686, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.8135658502578735, "rewards/margins": -0.0025499463081359863, "rewards/rejected": 0.8161157965660095, "step": 1031 }, { "epoch": 0.56, "learning_rate": 9.691117276498674e-08, "logits/chosen": -2.148493528366089, "logits/rejected": -2.150221347808838, "logps/chosen": -1.685810923576355, "logps/rejected": -2.342400550842285, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.9046909213066101, "rewards/margins": 0.0038771629333496094, "rewards/rejected": 0.9008137583732605, "step": 1032 }, { "epoch": 0.56, "learning_rate": 9.69036117411687e-08, "logits/chosen": -2.0556039810180664, "logits/rejected": -2.222498655319214, "logps/chosen": -8.946793556213379, "logps/rejected": -5.3266425132751465, "loss": 0.6718, "rewards/accuracies": 1.0, "rewards/chosen": 0.8629721999168396, "rewards/margins": 0.04309225082397461, "rewards/rejected": 0.819879949092865, "step": 1033 }, { "epoch": 0.56, "learning_rate": 9.689604177017622e-08, "logits/chosen": -2.166611909866333, "logits/rejected": -2.163142442703247, "logps/chosen": -5.46129035949707, "logps/rejected": -3.4493770599365234, "loss": 0.3166, "rewards/accuracies": 1.0, "rewards/chosen": 1.5375889539718628, "rewards/margins": 0.98758864402771, "rewards/rejected": 0.5500003099441528, "step": 1034 }, { "epoch": 0.56, "learning_rate": 9.688846285345333e-08, "logits/chosen": -2.0873587131500244, "logits/rejected": -2.248622417449951, "logps/chosen": -0.9328561425209045, "logps/rejected": -1.023538589477539, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.8766315579414368, "rewards/margins": 0.028759479522705078, "rewards/rejected": 0.8478720784187317, "step": 1035 }, { "epoch": 0.56, "learning_rate": 9.688087499244578e-08, "logits/chosen": -2.013209342956543, "logits/rejected": -2.0142970085144043, "logps/chosen": -5.030625820159912, "logps/rejected": -1.0322741270065308, "loss": 0.5616, "rewards/accuracies": 1.0, "rewards/chosen": 1.2420618534088135, "rewards/margins": 0.28302913904190063, "rewards/rejected": 0.9590327143669128, "step": 1036 }, { "epoch": 0.56, "learning_rate": 9.687327818860098e-08, "logits/chosen": -2.1871883869171143, "logits/rejected": -2.180901527404785, "logps/chosen": -8.3762788772583, "logps/rejected": -3.477353572845459, "loss": 0.509, "rewards/accuracies": 1.0, "rewards/chosen": 1.1736100912094116, "rewards/margins": 0.40997618436813354, "rewards/rejected": 0.7636339068412781, "step": 1037 }, { "epoch": 0.56, "learning_rate": 9.686567244336808e-08, "logits/chosen": -1.9625204801559448, "logits/rejected": -2.222539186477661, "logps/chosen": -0.6690754294395447, "logps/rejected": -0.727545440196991, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.8941099047660828, "rewards/margins": 0.027062594890594482, "rewards/rejected": 0.8670473098754883, "step": 1038 }, { "epoch": 0.56, "learning_rate": 9.685805775819793e-08, "logits/chosen": -1.9629075527191162, "logits/rejected": -2.0051352977752686, "logps/chosen": -9.146944999694824, "logps/rejected": -8.325749397277832, "loss": 0.3956, "rewards/accuracies": 1.0, "rewards/chosen": 1.4211032390594482, "rewards/margins": 0.7230687737464905, "rewards/rejected": 0.6980344653129578, "step": 1039 }, { "epoch": 0.56, "learning_rate": 9.685043413454308e-08, "logits/chosen": -1.9572266340255737, "logits/rejected": -2.2444045543670654, "logps/chosen": -2.7961742877960205, "logps/rejected": -2.561718702316284, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.6302459836006165, "rewards/margins": -0.008743584156036377, "rewards/rejected": 0.6389895677566528, "step": 1040 }, { "epoch": 0.56, "learning_rate": 9.684280157385776e-08, "logits/chosen": -2.1613471508026123, "logits/rejected": -2.322361469268799, "logps/chosen": -0.6955922245979309, "logps/rejected": -0.7206503748893738, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.8674108386039734, "rewards/margins": 0.02231365442276001, "rewards/rejected": 0.8450971841812134, "step": 1041 }, { "epoch": 0.56, "learning_rate": 9.683516007759798e-08, "logits/chosen": -2.136253595352173, "logits/rejected": -2.1346120834350586, "logps/chosen": -7.020175933837891, "logps/rejected": -3.647575855255127, "loss": 0.3958, "rewards/accuracies": 1.0, "rewards/chosen": 1.2777938842773438, "rewards/margins": 0.7225414514541626, "rewards/rejected": 0.5552524328231812, "step": 1042 }, { "epoch": 0.56, "learning_rate": 9.682750964722137e-08, "logits/chosen": -2.070692300796509, "logits/rejected": -2.0719027519226074, "logps/chosen": -3.6855175495147705, "logps/rejected": -0.9786580801010132, "loss": 0.617, "rewards/accuracies": 1.0, "rewards/chosen": 1.1926215887069702, "rewards/margins": 0.15848088264465332, "rewards/rejected": 1.034140706062317, "step": 1043 }, { "epoch": 0.56, "learning_rate": 9.681985028418732e-08, "logits/chosen": -2.0917398929595947, "logits/rejected": -2.0935709476470947, "logps/chosen": -1.4749135971069336, "logps/rejected": -2.5207033157348633, "loss": 0.5519, "rewards/accuracies": 1.0, "rewards/chosen": 0.9528800845146179, "rewards/margins": 0.30578362941741943, "rewards/rejected": 0.6470964550971985, "step": 1044 }, { "epoch": 0.56, "learning_rate": 9.681218198995688e-08, "logits/chosen": -2.0002384185791016, "logits/rejected": -2.2494423389434814, "logps/chosen": -1.2335116863250732, "logps/rejected": -1.2960933446884155, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.6510828137397766, "rewards/margins": 0.0040593743324279785, "rewards/rejected": 0.6470234394073486, "step": 1045 }, { "epoch": 0.56, "learning_rate": 9.680450476599287e-08, "logits/chosen": -2.2011468410491943, "logits/rejected": -2.303283452987671, "logps/chosen": -0.6997905373573303, "logps/rejected": -0.7681913375854492, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 1.019176721572876, "rewards/margins": 0.007392764091491699, "rewards/rejected": 1.0117839574813843, "step": 1046 }, { "epoch": 0.56, "learning_rate": 9.679681861375972e-08, "logits/chosen": -2.0947635173797607, "logits/rejected": -2.061152458190918, "logps/chosen": -4.2470245361328125, "logps/rejected": -2.4365427494049072, "loss": 0.5316, "rewards/accuracies": 1.0, "rewards/chosen": 1.0162410736083984, "rewards/margins": 0.35432106256484985, "rewards/rejected": 0.6619200110435486, "step": 1047 }, { "epoch": 0.57, "learning_rate": 9.678912353472366e-08, "logits/chosen": -2.121605396270752, "logits/rejected": -2.293869733810425, "logps/chosen": -4.0884318351745605, "logps/rejected": -4.237685680389404, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.8255643844604492, "rewards/margins": 0.033156514167785645, "rewards/rejected": 0.7924078702926636, "step": 1048 }, { "epoch": 0.57, "learning_rate": 9.678141953035254e-08, "logits/chosen": -2.0456161499023438, "logits/rejected": -2.2700629234313965, "logps/chosen": -2.726881742477417, "logps/rejected": -7.48921012878418, "loss": 0.6379, "rewards/accuracies": 1.0, "rewards/chosen": 0.7147125601768494, "rewards/margins": 0.11362707614898682, "rewards/rejected": 0.6010854840278625, "step": 1049 }, { "epoch": 0.57, "learning_rate": 9.677370660211597e-08, "logits/chosen": -2.0511269569396973, "logits/rejected": -2.052154302597046, "logps/chosen": -0.575202465057373, "logps/rejected": -3.4103474617004395, "loss": 0.5598, "rewards/accuracies": 1.0, "rewards/chosen": 0.8940349817276001, "rewards/margins": 0.28728777170181274, "rewards/rejected": 0.6067472100257874, "step": 1050 }, { "epoch": 0.57, "learning_rate": 9.676598475148524e-08, "logits/chosen": -2.068119525909424, "logits/rejected": -2.1967642307281494, "logps/chosen": -4.974937438964844, "logps/rejected": -0.7612846493721008, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 0.7380855679512024, "rewards/margins": 0.03300464153289795, "rewards/rejected": 0.7050809264183044, "step": 1051 }, { "epoch": 0.57, "learning_rate": 9.675825397993335e-08, "logits/chosen": -2.0269134044647217, "logits/rejected": -2.2487192153930664, "logps/chosen": -0.5152321457862854, "logps/rejected": -0.5124012231826782, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.8040469288825989, "rewards/margins": 0.030338168144226074, "rewards/rejected": 0.7737087607383728, "step": 1052 }, { "epoch": 0.57, "learning_rate": 9.675051428893497e-08, "logits/chosen": -1.9404090642929077, "logits/rejected": -2.2338340282440186, "logps/chosen": -4.700488567352295, "logps/rejected": -4.967685222625732, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.8807619214057922, "rewards/margins": 0.03940141201019287, "rewards/rejected": 0.8413605093955994, "step": 1053 }, { "epoch": 0.57, "learning_rate": 9.674276567996653e-08, "logits/chosen": -2.060603618621826, "logits/rejected": -2.0658721923828125, "logps/chosen": -2.2128710746765137, "logps/rejected": -3.218985080718994, "loss": 0.4569, "rewards/accuracies": 1.0, "rewards/chosen": 1.0804986953735352, "rewards/margins": 0.5462199449539185, "rewards/rejected": 0.5342787504196167, "step": 1054 }, { "epoch": 0.57, "learning_rate": 9.673500815450612e-08, "logits/chosen": -2.1363561153411865, "logits/rejected": -2.281250238418579, "logps/chosen": -5.157094955444336, "logps/rejected": -2.5852088928222656, "loss": 0.6376, "rewards/accuracies": 1.0, "rewards/chosen": 0.956603467464447, "rewards/margins": 0.11427044868469238, "rewards/rejected": 0.8423330187797546, "step": 1055 }, { "epoch": 0.57, "learning_rate": 9.672724171403352e-08, "logits/chosen": -2.0836172103881836, "logits/rejected": -2.089937925338745, "logps/chosen": -2.955303192138672, "logps/rejected": -4.703804969787598, "loss": 0.417, "rewards/accuracies": 1.0, "rewards/chosen": 1.1810581684112549, "rewards/margins": 0.65897136926651, "rewards/rejected": 0.5220867991447449, "step": 1056 }, { "epoch": 0.57, "learning_rate": 9.671946636003025e-08, "logits/chosen": -2.0909955501556396, "logits/rejected": -2.0226924419403076, "logps/chosen": -24.240205764770508, "logps/rejected": -3.024317979812622, "loss": 0.4699, "rewards/accuracies": 1.0, "rewards/chosen": 1.094690203666687, "rewards/margins": 0.5110017657279968, "rewards/rejected": 0.5836884379386902, "step": 1057 }, { "epoch": 0.57, "learning_rate": 9.67116820939795e-08, "logits/chosen": -2.1414411067962646, "logits/rejected": -2.2937355041503906, "logps/chosen": -1.0882259607315063, "logps/rejected": -1.0946273803710938, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.961037814617157, "rewards/margins": 0.005639314651489258, "rewards/rejected": 0.9553984999656677, "step": 1058 }, { "epoch": 0.57, "learning_rate": 9.670388891736618e-08, "logits/chosen": -2.032646417617798, "logits/rejected": -2.0306053161621094, "logps/chosen": -2.354647159576416, "logps/rejected": -3.437565565109253, "loss": 0.4523, "rewards/accuracies": 1.0, "rewards/chosen": 1.1848992109298706, "rewards/margins": 0.5586609244346619, "rewards/rejected": 0.6262382864952087, "step": 1059 }, { "epoch": 0.57, "learning_rate": 9.669608683167688e-08, "logits/chosen": -2.1257266998291016, "logits/rejected": -2.2736330032348633, "logps/chosen": -3.5980594158172607, "logps/rejected": -3.424889326095581, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.9436251521110535, "rewards/margins": 0.009784460067749023, "rewards/rejected": 0.9338406920433044, "step": 1060 }, { "epoch": 0.57, "learning_rate": 9.668827583839989e-08, "logits/chosen": -1.9935848712921143, "logits/rejected": -2.2603132724761963, "logps/chosen": -1.389067530632019, "logps/rejected": -1.5131996870040894, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.9750593304634094, "rewards/margins": 0.013137519359588623, "rewards/rejected": 0.9619218111038208, "step": 1061 }, { "epoch": 0.57, "learning_rate": 9.668045593902525e-08, "logits/chosen": -2.0377390384674072, "logits/rejected": -2.0394279956817627, "logps/chosen": -11.70062255859375, "logps/rejected": -3.378819465637207, "loss": 0.2539, "rewards/accuracies": 1.0, "rewards/chosen": 1.7225760221481323, "rewards/margins": 1.2413643598556519, "rewards/rejected": 0.48121166229248047, "step": 1062 }, { "epoch": 0.57, "learning_rate": 9.66726271350446e-08, "logits/chosen": -2.0933167934417725, "logits/rejected": -2.3181517124176025, "logps/chosen": -1.2766553163528442, "logps/rejected": -6.092528343200684, "loss": 0.5741, "rewards/accuracies": 1.0, "rewards/chosen": 0.9380640387535095, "rewards/margins": 0.2541430592536926, "rewards/rejected": 0.6839209794998169, "step": 1063 }, { "epoch": 0.57, "learning_rate": 9.66647894279514e-08, "logits/chosen": -2.07137131690979, "logits/rejected": -2.0546960830688477, "logps/chosen": -11.381498336791992, "logps/rejected": -1.2209643125534058, "loss": 0.4854, "rewards/accuracies": 1.0, "rewards/chosen": 1.3542470932006836, "rewards/margins": 0.4703526496887207, "rewards/rejected": 0.8838944435119629, "step": 1064 }, { "epoch": 0.57, "learning_rate": 9.665694281924069e-08, "logits/chosen": -1.940920114517212, "logits/rejected": -2.1942031383514404, "logps/chosen": -0.4184863567352295, "logps/rejected": -0.41430217027664185, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.9008760452270508, "rewards/margins": 0.011199951171875, "rewards/rejected": 0.8896760940551758, "step": 1065 }, { "epoch": 0.57, "learning_rate": 9.664908731040928e-08, "logits/chosen": -2.1316916942596436, "logits/rejected": -2.129134178161621, "logps/chosen": -8.341446876525879, "logps/rejected": -2.5954551696777344, "loss": 0.5084, "rewards/accuracies": 1.0, "rewards/chosen": 1.1196004152297974, "rewards/margins": 0.41144007444381714, "rewards/rejected": 0.7081603407859802, "step": 1066 }, { "epoch": 0.58, "learning_rate": 9.664122290295566e-08, "logits/chosen": -1.9797719717025757, "logits/rejected": -2.316126585006714, "logps/chosen": -0.39829114079475403, "logps/rejected": -0.4526704251766205, "loss": 0.6977, "rewards/accuracies": 0.0, "rewards/chosen": 0.971591591835022, "rewards/margins": -0.00912386178970337, "rewards/rejected": 0.9807154536247253, "step": 1067 }, { "epoch": 0.58, "learning_rate": 9.663334959838003e-08, "logits/chosen": -1.9719637632369995, "logits/rejected": -1.9715420007705688, "logps/chosen": -10.691985130310059, "logps/rejected": -1.4118680953979492, "loss": 0.5277, "rewards/accuracies": 1.0, "rewards/chosen": 1.2259315252304077, "rewards/margins": 0.3639217019081116, "rewards/rejected": 0.8620098233222961, "step": 1068 }, { "epoch": 0.58, "learning_rate": 9.662546739818427e-08, "logits/chosen": -2.05594801902771, "logits/rejected": -2.231217622756958, "logps/chosen": -0.8884211182594299, "logps/rejected": -0.8702710866928101, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.7739009857177734, "rewards/margins": 0.016269922256469727, "rewards/rejected": 0.7576310634613037, "step": 1069 }, { "epoch": 0.58, "learning_rate": 9.661757630387195e-08, "logits/chosen": -1.973673939704895, "logits/rejected": -2.248669147491455, "logps/chosen": -0.9255067110061646, "logps/rejected": -0.8746011257171631, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.7519102096557617, "rewards/margins": 0.014825046062469482, "rewards/rejected": 0.7370851635932922, "step": 1070 }, { "epoch": 0.58, "learning_rate": 9.660967631694837e-08, "logits/chosen": -2.191215991973877, "logits/rejected": -2.1897711753845215, "logps/chosen": -1.3661463260650635, "logps/rejected": -4.501672744750977, "loss": 0.5202, "rewards/accuracies": 1.0, "rewards/chosen": 1.0072975158691406, "rewards/margins": 0.3821753263473511, "rewards/rejected": 0.6251221895217896, "step": 1071 }, { "epoch": 0.58, "learning_rate": 9.660176743892048e-08, "logits/chosen": -1.9759628772735596, "logits/rejected": -1.9736629724502563, "logps/chosen": -3.2847225666046143, "logps/rejected": -7.502444267272949, "loss": 0.3503, "rewards/accuracies": 1.0, "rewards/chosen": 1.3593385219573975, "rewards/margins": 0.8686239719390869, "rewards/rejected": 0.49071455001831055, "step": 1072 }, { "epoch": 0.58, "learning_rate": 9.659384967129695e-08, "logits/chosen": -1.9823474884033203, "logits/rejected": -1.981200933456421, "logps/chosen": -2.4336068630218506, "logps/rejected": -3.017314910888672, "loss": 0.5511, "rewards/accuracies": 1.0, "rewards/chosen": 0.8878390192985535, "rewards/margins": 0.3077528476715088, "rewards/rejected": 0.5800861716270447, "step": 1073 }, { "epoch": 0.58, "learning_rate": 9.658592301558819e-08, "logits/chosen": -2.028846263885498, "logits/rejected": -2.103231191635132, "logps/chosen": -4.515023708343506, "logps/rejected": -12.135860443115234, "loss": 0.3601, "rewards/accuracies": 1.0, "rewards/chosen": 1.2816364765167236, "rewards/margins": 0.8357889652252197, "rewards/rejected": 0.4458475112915039, "step": 1074 }, { "epoch": 0.58, "learning_rate": 9.657798747330621e-08, "logits/chosen": -2.0787465572357178, "logits/rejected": -2.0795321464538574, "logps/chosen": -3.9127116203308105, "logps/rejected": -1.1115798950195312, "loss": 0.6284, "rewards/accuracies": 1.0, "rewards/chosen": 0.9518957138061523, "rewards/margins": 0.13408058881759644, "rewards/rejected": 0.8178151249885559, "step": 1075 }, { "epoch": 0.58, "learning_rate": 9.657004304596479e-08, "logits/chosen": -2.0806849002838135, "logits/rejected": -2.085556983947754, "logps/chosen": -2.2230191230773926, "logps/rejected": -2.949312925338745, "loss": 0.4938, "rewards/accuracies": 1.0, "rewards/chosen": 1.024666428565979, "rewards/margins": 0.4487031102180481, "rewards/rejected": 0.5759633183479309, "step": 1076 }, { "epoch": 0.58, "learning_rate": 9.656208973507938e-08, "logits/chosen": -2.1182892322540283, "logits/rejected": -2.144390344619751, "logps/chosen": -9.105131149291992, "logps/rejected": -18.32587242126465, "loss": 0.3622, "rewards/accuracies": 1.0, "rewards/chosen": 1.2044477462768555, "rewards/margins": 0.8288843035697937, "rewards/rejected": 0.37556344270706177, "step": 1077 }, { "epoch": 0.58, "learning_rate": 9.655412754216712e-08, "logits/chosen": -1.9960755109786987, "logits/rejected": -2.0029892921447754, "logps/chosen": -1.9525580406188965, "logps/rejected": -2.7316770553588867, "loss": 0.4195, "rewards/accuracies": 1.0, "rewards/chosen": 1.273858666419983, "rewards/margins": 0.651639461517334, "rewards/rejected": 0.6222192049026489, "step": 1078 }, { "epoch": 0.58, "learning_rate": 9.654615646874686e-08, "logits/chosen": -2.0428755283355713, "logits/rejected": -2.2701234817504883, "logps/chosen": -0.7398020029067993, "logps/rejected": -0.7292104959487915, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.803252637386322, "rewards/margins": 0.016768038272857666, "rewards/rejected": 0.7864845991134644, "step": 1079 }, { "epoch": 0.58, "learning_rate": 9.653817651633915e-08, "logits/chosen": -2.24599289894104, "logits/rejected": -2.167280435562134, "logps/chosen": -50.985435485839844, "logps/rejected": -5.453951358795166, "loss": 0.5311, "rewards/accuracies": 1.0, "rewards/chosen": 1.1764366626739502, "rewards/margins": 0.35557442903518677, "rewards/rejected": 0.8208622336387634, "step": 1080 }, { "epoch": 0.58, "learning_rate": 9.653018768646619e-08, "logits/chosen": -2.0619189739227295, "logits/rejected": -2.049687385559082, "logps/chosen": -5.137975692749023, "logps/rejected": -4.628176212310791, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 1.3552440404891968, "rewards/margins": 0.8893964290618896, "rewards/rejected": 0.4658476412296295, "step": 1081 }, { "epoch": 0.58, "learning_rate": 9.652218998065191e-08, "logits/chosen": -2.1018381118774414, "logits/rejected": -2.2533867359161377, "logps/chosen": -2.6933486461639404, "logps/rejected": -2.565795421600342, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.9893758893013, "rewards/margins": 0.0039501190185546875, "rewards/rejected": 0.9854257702827454, "step": 1082 }, { "epoch": 0.58, "learning_rate": 9.651418340042194e-08, "logits/chosen": -2.0175607204437256, "logits/rejected": -2.0151925086975098, "logps/chosen": -6.705609321594238, "logps/rejected": -3.7911336421966553, "loss": 0.3967, "rewards/accuracies": 1.0, "rewards/chosen": 1.234086513519287, "rewards/margins": 0.7197079658508301, "rewards/rejected": 0.514378547668457, "step": 1083 }, { "epoch": 0.58, "learning_rate": 9.650616794730356e-08, "logits/chosen": -1.9485543966293335, "logits/rejected": -1.9488738775253296, "logps/chosen": -5.008952617645264, "logps/rejected": -2.209758996963501, "loss": 0.401, "rewards/accuracies": 1.0, "rewards/chosen": 1.3594955205917358, "rewards/margins": 0.7067012190818787, "rewards/rejected": 0.6527943015098572, "step": 1084 }, { "epoch": 0.59, "learning_rate": 9.64981436228258e-08, "logits/chosen": -2.0612785816192627, "logits/rejected": -2.279115915298462, "logps/chosen": -9.454439163208008, "logps/rejected": -10.915202140808105, "loss": 0.6115, "rewards/accuracies": 1.0, "rewards/chosen": 0.8067952990531921, "rewards/margins": 0.17047184705734253, "rewards/rejected": 0.6363234519958496, "step": 1085 }, { "epoch": 0.59, "learning_rate": 9.649011042851933e-08, "logits/chosen": -2.106755256652832, "logits/rejected": -2.0150156021118164, "logps/chosen": -33.0238037109375, "logps/rejected": -3.4760448932647705, "loss": 0.3987, "rewards/accuracies": 1.0, "rewards/chosen": 1.2881237268447876, "rewards/margins": 0.7137246131896973, "rewards/rejected": 0.5743991136550903, "step": 1086 }, { "epoch": 0.59, "learning_rate": 9.648206836591656e-08, "logits/chosen": -1.96021568775177, "logits/rejected": -2.2471466064453125, "logps/chosen": -0.7365649938583374, "logps/rejected": -0.8072270154953003, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 0.7433640360832214, "rewards/margins": -0.002118408679962158, "rewards/rejected": 0.7454824447631836, "step": 1087 }, { "epoch": 0.59, "learning_rate": 9.647401743655155e-08, "logits/chosen": -2.0969793796539307, "logits/rejected": -2.127957582473755, "logps/chosen": -3.552691698074341, "logps/rejected": -24.446969985961914, "loss": 0.5841, "rewards/accuracies": 1.0, "rewards/chosen": 1.0060938596725464, "rewards/margins": 0.23154151439666748, "rewards/rejected": 0.7745523452758789, "step": 1088 }, { "epoch": 0.59, "learning_rate": 9.646595764196006e-08, "logits/chosen": -2.021742105484009, "logits/rejected": -2.0081145763397217, "logps/chosen": -4.0990166664123535, "logps/rejected": -2.367894411087036, "loss": 0.5177, "rewards/accuracies": 1.0, "rewards/chosen": 1.0876502990722656, "rewards/margins": 0.38834673166275024, "rewards/rejected": 0.6993035674095154, "step": 1089 }, { "epoch": 0.59, "learning_rate": 9.645788898367956e-08, "logits/chosen": -1.9552502632141113, "logits/rejected": -1.9597243070602417, "logps/chosen": -3.097114324569702, "logps/rejected": -3.847956418991089, "loss": 0.392, "rewards/accuracies": 1.0, "rewards/chosen": 1.3286592960357666, "rewards/margins": 0.7341442704200745, "rewards/rejected": 0.5945150256156921, "step": 1090 }, { "epoch": 0.59, "learning_rate": 9.644981146324922e-08, "logits/chosen": -1.973331332206726, "logits/rejected": -2.29651141166687, "logps/chosen": -0.5541804432868958, "logps/rejected": -0.6190695762634277, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9456900954246521, "rewards/margins": 0.0011046528816223145, "rewards/rejected": 0.9445854425430298, "step": 1091 }, { "epoch": 0.59, "learning_rate": 9.644172508220985e-08, "logits/chosen": -2.136934995651245, "logits/rejected": -2.2808997631073, "logps/chosen": -0.8258074522018433, "logps/rejected": -0.7217227816581726, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.9066866040229797, "rewards/margins": -0.0031784772872924805, "rewards/rejected": 0.9098650813102722, "step": 1092 }, { "epoch": 0.59, "learning_rate": 9.643362984210399e-08, "logits/chosen": -2.0019516944885254, "logits/rejected": -2.211238145828247, "logps/chosen": -0.6329044103622437, "logps/rejected": -0.7221672534942627, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 0.7929303050041199, "rewards/margins": 0.032922327518463135, "rewards/rejected": 0.7600079774856567, "step": 1093 }, { "epoch": 0.59, "learning_rate": 9.642552574447587e-08, "logits/chosen": -2.1312484741210938, "logits/rejected": -2.3087267875671387, "logps/chosen": -6.607151985168457, "logps/rejected": -6.463875770568848, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.8144623637199402, "rewards/margins": 0.004034519195556641, "rewards/rejected": 0.8104278445243835, "step": 1094 }, { "epoch": 0.59, "learning_rate": 9.641741279087136e-08, "logits/chosen": -2.0239083766937256, "logits/rejected": -2.3022170066833496, "logps/chosen": -2.67471981048584, "logps/rejected": -1.1479952335357666, "loss": 0.7152, "rewards/accuracies": 0.0, "rewards/chosen": 0.9742966890335083, "rewards/margins": -0.043665289878845215, "rewards/rejected": 1.0179619789123535, "step": 1095 }, { "epoch": 0.59, "learning_rate": 9.640929098283812e-08, "logits/chosen": -2.0974862575531006, "logits/rejected": -2.2988388538360596, "logps/chosen": -0.6813235282897949, "logps/rejected": -0.7496495842933655, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.8117704391479492, "rewards/margins": 0.01850229501724243, "rewards/rejected": 0.7932681441307068, "step": 1096 }, { "epoch": 0.59, "learning_rate": 9.640116032192541e-08, "logits/chosen": -2.019742488861084, "logits/rejected": -2.2116551399230957, "logps/chosen": -2.9317662715911865, "logps/rejected": -2.89378023147583, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.8090789914131165, "rewards/margins": 0.020610272884368896, "rewards/rejected": 0.7884687185287476, "step": 1097 }, { "epoch": 0.59, "learning_rate": 9.63930208096842e-08, "logits/chosen": -1.8930118083953857, "logits/rejected": -1.9017051458358765, "logps/chosen": -2.825596570968628, "logps/rejected": -4.542858123779297, "loss": 0.4265, "rewards/accuracies": 1.0, "rewards/chosen": 1.1908448934555054, "rewards/margins": 0.631203830242157, "rewards/rejected": 0.5596410632133484, "step": 1098 }, { "epoch": 0.59, "learning_rate": 9.638487244766717e-08, "logits/chosen": -1.9913973808288574, "logits/rejected": -2.269134283065796, "logps/chosen": -0.6344667673110962, "logps/rejected": -0.7313399314880371, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9295390248298645, "rewards/margins": 0.01619899272918701, "rewards/rejected": 0.9133400321006775, "step": 1099 }, { "epoch": 0.59, "learning_rate": 9.637671523742866e-08, "logits/chosen": -2.008587121963501, "logits/rejected": -2.0021374225616455, "logps/chosen": -16.155073165893555, "logps/rejected": -5.353484153747559, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": 1.7070688009262085, "rewards/margins": 1.002187728881836, "rewards/rejected": 0.7048810124397278, "step": 1100 }, { "epoch": 0.59, "learning_rate": 9.636854918052473e-08, "logits/chosen": -2.0245602130889893, "logits/rejected": -2.0321450233459473, "logps/chosen": -3.0892789363861084, "logps/rejected": -1.922329306602478, "loss": 0.3608, "rewards/accuracies": 1.0, "rewards/chosen": 1.5245510339736938, "rewards/margins": 0.833561360836029, "rewards/rejected": 0.6909896731376648, "step": 1101 }, { "epoch": 0.59, "learning_rate": 9.636037427851309e-08, "logits/chosen": -2.0569851398468018, "logits/rejected": -2.0565872192382812, "logps/chosen": -1.3357242345809937, "logps/rejected": -1.5125558376312256, "loss": 0.7017, "rewards/accuracies": 0.0, "rewards/chosen": 0.8701789975166321, "rewards/margins": -0.017042160034179688, "rewards/rejected": 0.8872211575508118, "step": 1102 }, { "epoch": 0.59, "learning_rate": 9.635219053295316e-08, "logits/chosen": -2.1146605014801025, "logits/rejected": -2.1113836765289307, "logps/chosen": -7.861519813537598, "logps/rejected": -2.6326873302459717, "loss": 0.3254, "rewards/accuracies": 1.0, "rewards/chosen": 1.5078539848327637, "rewards/margins": 0.955471396446228, "rewards/rejected": 0.5523825883865356, "step": 1103 }, { "epoch": 0.6, "learning_rate": 9.634399794540605e-08, "logits/chosen": -2.0103628635406494, "logits/rejected": -2.010404109954834, "logps/chosen": -0.47005969285964966, "logps/rejected": -2.8666388988494873, "loss": 0.5365, "rewards/accuracies": 1.0, "rewards/chosen": 0.8628005981445312, "rewards/margins": 0.34247320890426636, "rewards/rejected": 0.5203273892402649, "step": 1104 }, { "epoch": 0.6, "learning_rate": 9.633579651743456e-08, "logits/chosen": -1.9721976518630981, "logits/rejected": -2.2161314487457275, "logps/chosen": -0.3993889391422272, "logps/rejected": -0.44298622012138367, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.8837427496910095, "rewards/margins": 0.010733544826507568, "rewards/rejected": 0.873009204864502, "step": 1105 }, { "epoch": 0.6, "learning_rate": 9.632758625060315e-08, "logits/chosen": -2.156741142272949, "logits/rejected": -2.29911208152771, "logps/chosen": -2.4737184047698975, "logps/rejected": -2.146435499191284, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 0.9548807144165039, "rewards/margins": -0.02174520492553711, "rewards/rejected": 0.976625919342041, "step": 1106 }, { "epoch": 0.6, "learning_rate": 9.631936714647799e-08, "logits/chosen": -2.0463736057281494, "logits/rejected": -2.213590383529663, "logps/chosen": -0.5296199321746826, "logps/rejected": -0.5869122743606567, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 1.0050071477890015, "rewards/margins": 0.024259507656097412, "rewards/rejected": 0.980747640132904, "step": 1107 }, { "epoch": 0.6, "learning_rate": 9.631113920662692e-08, "logits/chosen": -1.9952305555343628, "logits/rejected": -2.2413928508758545, "logps/chosen": -0.37234967947006226, "logps/rejected": -0.3897281587123871, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.908985435962677, "rewards/margins": -0.00019109249114990234, "rewards/rejected": 0.9091765284538269, "step": 1108 }, { "epoch": 0.6, "learning_rate": 9.63029024326195e-08, "logits/chosen": -2.092506170272827, "logits/rejected": -2.091076135635376, "logps/chosen": -1.179218053817749, "logps/rejected": -1.6150734424591064, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.9269067049026489, "rewards/margins": 0.03949844837188721, "rewards/rejected": 0.8874082565307617, "step": 1109 }, { "epoch": 0.6, "learning_rate": 9.629465682602691e-08, "logits/chosen": -2.1408021450042725, "logits/rejected": -2.140763759613037, "logps/chosen": -4.184080123901367, "logps/rejected": -3.368323564529419, "loss": 0.4782, "rewards/accuracies": 1.0, "rewards/chosen": 1.3013235330581665, "rewards/margins": 0.4892429709434509, "rewards/rejected": 0.8120805621147156, "step": 1110 }, { "epoch": 0.6, "learning_rate": 9.628640238842208e-08, "logits/chosen": -2.0434207916259766, "logits/rejected": -2.0453906059265137, "logps/chosen": -5.920599937438965, "logps/rejected": -2.6050853729248047, "loss": 0.5028, "rewards/accuracies": 1.0, "rewards/chosen": 1.2695530652999878, "rewards/margins": 0.4255450963973999, "rewards/rejected": 0.8440079689025879, "step": 1111 }, { "epoch": 0.6, "learning_rate": 9.62781391213796e-08, "logits/chosen": -2.1175057888031006, "logits/rejected": -2.112779140472412, "logps/chosen": -5.411489009857178, "logps/rejected": -4.622114658355713, "loss": 0.3966, "rewards/accuracies": 1.0, "rewards/chosen": 1.2975826263427734, "rewards/margins": 0.7199279069900513, "rewards/rejected": 0.5776547193527222, "step": 1112 }, { "epoch": 0.6, "learning_rate": 9.626986702647573e-08, "logits/chosen": -2.103804111480713, "logits/rejected": -2.334846258163452, "logps/chosen": -1.136012315750122, "logps/rejected": -1.1683017015457153, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9523148536682129, "rewards/margins": 0.01881200075149536, "rewards/rejected": 0.9335028529167175, "step": 1113 }, { "epoch": 0.6, "learning_rate": 9.626158610528843e-08, "logits/chosen": -2.1665596961975098, "logits/rejected": -2.266752004623413, "logps/chosen": -2.462833881378174, "logps/rejected": -2.3838212490081787, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.9367117285728455, "rewards/margins": 0.016679584980010986, "rewards/rejected": 0.9200321435928345, "step": 1114 }, { "epoch": 0.6, "learning_rate": 9.625329635939735e-08, "logits/chosen": -2.25311541557312, "logits/rejected": -2.10870623588562, "logps/chosen": -36.766597747802734, "logps/rejected": -3.3445897102355957, "loss": 0.388, "rewards/accuracies": 1.0, "rewards/chosen": 1.3230152130126953, "rewards/margins": 0.746337354183197, "rewards/rejected": 0.5766778588294983, "step": 1115 }, { "epoch": 0.6, "learning_rate": 9.624499779038381e-08, "logits/chosen": -1.955525517463684, "logits/rejected": -2.2729265689849854, "logps/chosen": -2.0495107173919678, "logps/rejected": -2.0943520069122314, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.8709800839424133, "rewards/margins": 0.015051484107971191, "rewards/rejected": 0.8559285998344421, "step": 1116 }, { "epoch": 0.6, "learning_rate": 9.62366903998308e-08, "logits/chosen": -2.047853469848633, "logits/rejected": -2.28456711769104, "logps/chosen": -2.0205917358398438, "logps/rejected": -1.9891059398651123, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.9133954048156738, "rewards/margins": -0.007974565029144287, "rewards/rejected": 0.9213699698448181, "step": 1117 }, { "epoch": 0.6, "learning_rate": 9.622837418932303e-08, "logits/chosen": -1.9539076089859009, "logits/rejected": -1.9606505632400513, "logps/chosen": -1.9983646869659424, "logps/rejected": -3.047497034072876, "loss": 0.4777, "rewards/accuracies": 1.0, "rewards/chosen": 1.0864026546478271, "rewards/margins": 0.49046164751052856, "rewards/rejected": 0.5959410071372986, "step": 1118 }, { "epoch": 0.6, "learning_rate": 9.622004916044688e-08, "logits/chosen": -1.9875357151031494, "logits/rejected": -1.9886664152145386, "logps/chosen": -1.6476294994354248, "logps/rejected": -2.738332748413086, "loss": 0.5048, "rewards/accuracies": 1.0, "rewards/chosen": 1.0920075178146362, "rewards/margins": 0.42050808668136597, "rewards/rejected": 0.6714994311332703, "step": 1119 }, { "epoch": 0.6, "learning_rate": 9.621171531479037e-08, "logits/chosen": -2.144547462463379, "logits/rejected": -2.3035130500793457, "logps/chosen": -1.3869304656982422, "logps/rejected": -1.3862804174423218, "loss": 0.7025, "rewards/accuracies": 0.0, "rewards/chosen": 0.8608460426330566, "rewards/margins": -0.01852238178253174, "rewards/rejected": 0.8793684244155884, "step": 1120 }, { "epoch": 0.6, "learning_rate": 9.620337265394327e-08, "logits/chosen": -2.1166534423828125, "logits/rejected": -2.12544322013855, "logps/chosen": -1.9550130367279053, "logps/rejected": -4.030482769012451, "loss": 0.4008, "rewards/accuracies": 1.0, "rewards/chosen": 1.3030184507369995, "rewards/margins": 0.7073119282722473, "rewards/rejected": 0.5957065224647522, "step": 1121 }, { "epoch": 0.61, "learning_rate": 9.619502117949698e-08, "logits/chosen": -2.0928871631622314, "logits/rejected": -2.249924421310425, "logps/chosen": -0.6339318156242371, "logps/rejected": -0.6036701798439026, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.9047350883483887, "rewards/margins": 0.03442269563674927, "rewards/rejected": 0.8703123927116394, "step": 1122 }, { "epoch": 0.61, "learning_rate": 9.618666089304461e-08, "logits/chosen": -2.041049003601074, "logits/rejected": -2.041322946548462, "logps/chosen": -1.128182053565979, "logps/rejected": -3.8574812412261963, "loss": 0.6181, "rewards/accuracies": 1.0, "rewards/chosen": 0.9638287425041199, "rewards/margins": 0.15608805418014526, "rewards/rejected": 0.8077406883239746, "step": 1123 }, { "epoch": 0.61, "learning_rate": 9.617829179618093e-08, "logits/chosen": -2.044151544570923, "logits/rejected": -2.3006508350372314, "logps/chosen": -8.925511360168457, "logps/rejected": -3.2310657501220703, "loss": 0.7351, "rewards/accuracies": 0.0, "rewards/chosen": 0.8565869331359863, "rewards/margins": -0.08216279745101929, "rewards/rejected": 0.9387497305870056, "step": 1124 }, { "epoch": 0.61, "learning_rate": 9.61699138905024e-08, "logits/chosen": -1.9882564544677734, "logits/rejected": -1.9912787675857544, "logps/chosen": -1.7963967323303223, "logps/rejected": -4.528615474700928, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 1.0327389240264893, "rewards/margins": 0.6038216352462769, "rewards/rejected": 0.42891725897789, "step": 1125 }, { "epoch": 0.61, "learning_rate": 9.616152717760717e-08, "logits/chosen": -2.059393882751465, "logits/rejected": -2.2448513507843018, "logps/chosen": -0.8777087926864624, "logps/rejected": -0.8062111735343933, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.871182382106781, "rewards/margins": 0.012934207916259766, "rewards/rejected": 0.8582481741905212, "step": 1126 }, { "epoch": 0.61, "learning_rate": 9.615313165909505e-08, "logits/chosen": -2.1318564414978027, "logits/rejected": -2.308126926422119, "logps/chosen": -2.0489416122436523, "logps/rejected": -4.979173183441162, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8023006319999695, "rewards/margins": 0.009746313095092773, "rewards/rejected": 0.7925543189048767, "step": 1127 }, { "epoch": 0.61, "learning_rate": 9.614472733656756e-08, "logits/chosen": -2.0534026622772217, "logits/rejected": -2.058687686920166, "logps/chosen": -2.920379400253296, "logps/rejected": -3.0099263191223145, "loss": 0.4401, "rewards/accuracies": 1.0, "rewards/chosen": 1.3052752017974854, "rewards/margins": 0.5925523638725281, "rewards/rejected": 0.7127228379249573, "step": 1128 }, { "epoch": 0.61, "learning_rate": 9.613631421162786e-08, "logits/chosen": -2.002180576324463, "logits/rejected": -2.0066778659820557, "logps/chosen": -1.6223315000534058, "logps/rejected": -2.9754443168640137, "loss": 0.506, "rewards/accuracies": 1.0, "rewards/chosen": 0.9746490716934204, "rewards/margins": 0.4176930785179138, "rewards/rejected": 0.5569559931755066, "step": 1129 }, { "epoch": 0.61, "learning_rate": 9.612789228588081e-08, "logits/chosen": -2.0177228450775146, "logits/rejected": -2.0192108154296875, "logps/chosen": -0.8646901249885559, "logps/rejected": -2.587162971496582, "loss": 0.5432, "rewards/accuracies": 1.0, "rewards/chosen": 0.8492681384086609, "rewards/margins": 0.32629621028900146, "rewards/rejected": 0.5229719281196594, "step": 1130 }, { "epoch": 0.61, "learning_rate": 9.611946156093295e-08, "logits/chosen": -1.9574936628341675, "logits/rejected": -1.9618942737579346, "logps/chosen": -0.699543833732605, "logps/rejected": -5.230299472808838, "loss": 0.5607, "rewards/accuracies": 1.0, "rewards/chosen": 0.7829331755638123, "rewards/margins": 0.28524377942085266, "rewards/rejected": 0.4976893961429596, "step": 1131 }, { "epoch": 0.61, "learning_rate": 9.611102203839252e-08, "logits/chosen": -2.0609474182128906, "logits/rejected": -2.2942967414855957, "logps/chosen": -1.7501676082611084, "logps/rejected": -1.5692720413208008, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.7523527145385742, "rewards/margins": 0.01773965358734131, "rewards/rejected": 0.7346130609512329, "step": 1132 }, { "epoch": 0.61, "learning_rate": 9.610257371986938e-08, "logits/chosen": -2.0675196647644043, "logits/rejected": -2.249635696411133, "logps/chosen": -2.7530887126922607, "logps/rejected": -7.648968696594238, "loss": 0.7045, "rewards/accuracies": 0.0, "rewards/chosen": 1.087933897972107, "rewards/margins": -0.0226287841796875, "rewards/rejected": 1.1105626821517944, "step": 1133 }, { "epoch": 0.61, "learning_rate": 9.609411660697513e-08, "logits/chosen": -2.0798499584198, "logits/rejected": -1.9549614191055298, "logps/chosen": -24.979515075683594, "logps/rejected": -8.701518058776855, "loss": 0.5668, "rewards/accuracies": 1.0, "rewards/chosen": 1.0693531036376953, "rewards/margins": 0.2708953022956848, "rewards/rejected": 0.7984578013420105, "step": 1134 }, { "epoch": 0.61, "learning_rate": 9.6085650701323e-08, "logits/chosen": -2.014768123626709, "logits/rejected": -2.205338954925537, "logps/chosen": -1.6207644939422607, "logps/rejected": -1.7002780437469482, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.8301348686218262, "rewards/margins": 0.005126297473907471, "rewards/rejected": 0.8250085711479187, "step": 1135 }, { "epoch": 0.61, "learning_rate": 9.607717600452794e-08, "logits/chosen": -2.0223424434661865, "logits/rejected": -2.2601239681243896, "logps/chosen": -2.653257131576538, "logps/rejected": -2.588589906692505, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 0.8272635340690613, "rewards/margins": -0.017224252223968506, "rewards/rejected": 0.8444877862930298, "step": 1136 }, { "epoch": 0.61, "learning_rate": 9.606869251820652e-08, "logits/chosen": -2.0233635902404785, "logits/rejected": -1.9779846668243408, "logps/chosen": -12.235723495483398, "logps/rejected": -7.89937686920166, "loss": 0.5674, "rewards/accuracies": 1.0, "rewards/chosen": 1.0681581497192383, "rewards/margins": 0.26963573694229126, "rewards/rejected": 0.798522412776947, "step": 1137 }, { "epoch": 0.61, "learning_rate": 9.606020024397705e-08, "logits/chosen": -1.999860405921936, "logits/rejected": -2.203986644744873, "logps/chosen": -1.310516119003296, "logps/rejected": -1.3593195676803589, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.8318115472793579, "rewards/margins": 0.009182572364807129, "rewards/rejected": 0.8226289749145508, "step": 1138 }, { "epoch": 0.61, "learning_rate": 9.605169918345947e-08, "logits/chosen": -2.081054925918579, "logits/rejected": -2.084402561187744, "logps/chosen": -1.0879521369934082, "logps/rejected": -2.850040912628174, "loss": 0.5194, "rewards/accuracies": 1.0, "rewards/chosen": 1.038918137550354, "rewards/margins": 0.3841724991798401, "rewards/rejected": 0.6547456383705139, "step": 1139 }, { "epoch": 0.61, "learning_rate": 9.604318933827544e-08, "logits/chosen": -2.0420355796813965, "logits/rejected": -2.051649808883667, "logps/chosen": -2.092839241027832, "logps/rejected": -2.5545876026153564, "loss": 0.5135, "rewards/accuracies": 1.0, "rewards/chosen": 1.0621854066848755, "rewards/margins": 0.3988350033760071, "rewards/rejected": 0.6633504033088684, "step": 1140 }, { "epoch": 0.62, "learning_rate": 9.603467071004824e-08, "logits/chosen": -1.9669405221939087, "logits/rejected": -1.963287591934204, "logps/chosen": -7.56395149230957, "logps/rejected": -2.304340124130249, "loss": 0.372, "rewards/accuracies": 1.0, "rewards/chosen": 1.4914791584014893, "rewards/margins": 0.7972519993782043, "rewards/rejected": 0.6942271590232849, "step": 1141 }, { "epoch": 0.62, "learning_rate": 9.602614330040289e-08, "logits/chosen": -2.0377511978149414, "logits/rejected": -2.2473742961883545, "logps/chosen": -0.7126085758209229, "logps/rejected": -0.6341487169265747, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.8030579686164856, "rewards/margins": -0.00014144182205200195, "rewards/rejected": 0.8031994104385376, "step": 1142 }, { "epoch": 0.62, "learning_rate": 9.6017607110966e-08, "logits/chosen": -2.0365028381347656, "logits/rejected": -2.246948003768921, "logps/chosen": -3.1955389976501465, "logps/rejected": -2.970245361328125, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.7835620641708374, "rewards/margins": 0.000849306583404541, "rewards/rejected": 0.7827127575874329, "step": 1143 }, { "epoch": 0.62, "learning_rate": 9.600906214336595e-08, "logits/chosen": -2.118788480758667, "logits/rejected": -2.2917914390563965, "logps/chosen": -8.872701644897461, "logps/rejected": -8.535887718200684, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 1.010738730430603, "rewards/margins": 0.015639007091522217, "rewards/rejected": 0.9950997233390808, "step": 1144 }, { "epoch": 0.62, "learning_rate": 9.600050839923273e-08, "logits/chosen": -2.143362283706665, "logits/rejected": -2.248683452606201, "logps/chosen": -2.028439521789551, "logps/rejected": -1.9212509393692017, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.8374972343444824, "rewards/margins": 0.02220219373703003, "rewards/rejected": 0.8152950406074524, "step": 1145 }, { "epoch": 0.62, "learning_rate": 9.599194588019801e-08, "logits/chosen": -1.9439258575439453, "logits/rejected": -1.9184142351150513, "logps/chosen": -15.208444595336914, "logps/rejected": -3.7421069145202637, "loss": 0.4306, "rewards/accuracies": 1.0, "rewards/chosen": 1.0979326963424683, "rewards/margins": 0.6195991635322571, "rewards/rejected": 0.4783335328102112, "step": 1146 }, { "epoch": 0.62, "learning_rate": 9.598337458789517e-08, "logits/chosen": -2.1050848960876465, "logits/rejected": -2.3063294887542725, "logps/chosen": -1.2514572143554688, "logps/rejected": -1.2096747159957886, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9379714131355286, "rewards/margins": 0.012524127960205078, "rewards/rejected": 0.9254472851753235, "step": 1147 }, { "epoch": 0.62, "learning_rate": 9.597479452395922e-08, "logits/chosen": -2.071049451828003, "logits/rejected": -2.097461223602295, "logps/chosen": -5.132946968078613, "logps/rejected": -6.485964775085449, "loss": 0.4044, "rewards/accuracies": 1.0, "rewards/chosen": 1.4336203336715698, "rewards/margins": 0.6962953209877014, "rewards/rejected": 0.7373250126838684, "step": 1148 }, { "epoch": 0.62, "learning_rate": 9.59662056900269e-08, "logits/chosen": -2.0111804008483887, "logits/rejected": -2.203894853591919, "logps/chosen": -0.6930499076843262, "logps/rejected": -0.8264521360397339, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.9263399243354797, "rewards/margins": 0.006134152412414551, "rewards/rejected": 0.9202057719230652, "step": 1149 }, { "epoch": 0.62, "learning_rate": 9.595760808773655e-08, "logits/chosen": -2.10189151763916, "logits/rejected": -2.1010406017303467, "logps/chosen": -1.4945287704467773, "logps/rejected": -3.6279361248016357, "loss": 0.5314, "rewards/accuracies": 1.0, "rewards/chosen": 0.8823796510696411, "rewards/margins": 0.3548809885978699, "rewards/rejected": 0.5274986624717712, "step": 1150 }, { "epoch": 0.62, "learning_rate": 9.594900171872823e-08, "logits/chosen": -2.1264493465423584, "logits/rejected": -2.1323111057281494, "logps/chosen": -1.388541579246521, "logps/rejected": -2.188275098800659, "loss": 0.525, "rewards/accuracies": 1.0, "rewards/chosen": 0.9045941233634949, "rewards/margins": 0.3704778552055359, "rewards/rejected": 0.534116268157959, "step": 1151 }, { "epoch": 0.62, "learning_rate": 9.594038658464367e-08, "logits/chosen": -1.9351906776428223, "logits/rejected": -1.9297268390655518, "logps/chosen": -3.976583242416382, "logps/rejected": -5.045840740203857, "loss": 0.5837, "rewards/accuracies": 1.0, "rewards/chosen": 0.7320584654808044, "rewards/margins": 0.23240554332733154, "rewards/rejected": 0.4996529221534729, "step": 1152 }, { "epoch": 0.62, "learning_rate": 9.593176268712625e-08, "logits/chosen": -2.070748805999756, "logits/rejected": -2.07867431640625, "logps/chosen": -2.0679855346679688, "logps/rejected": -2.789182662963867, "loss": 0.506, "rewards/accuracies": 1.0, "rewards/chosen": 0.9049882292747498, "rewards/margins": 0.41749152541160583, "rewards/rejected": 0.4874967038631439, "step": 1153 }, { "epoch": 0.62, "learning_rate": 9.592313002782105e-08, "logits/chosen": -1.9123166799545288, "logits/rejected": -2.229163885116577, "logps/chosen": -0.48569414019584656, "logps/rejected": -0.5106716156005859, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.8547679781913757, "rewards/margins": 0.039215803146362305, "rewards/rejected": 0.8155521750450134, "step": 1154 }, { "epoch": 0.62, "learning_rate": 9.59144886083748e-08, "logits/chosen": -1.9739840030670166, "logits/rejected": -2.220299482345581, "logps/chosen": -1.2844430208206177, "logps/rejected": -1.3240547180175781, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.9585553407669067, "rewards/margins": 8.022785186767578e-05, "rewards/rejected": 0.9584751129150391, "step": 1155 }, { "epoch": 0.62, "learning_rate": 9.59058384304359e-08, "logits/chosen": -2.025970935821533, "logits/rejected": -2.2559170722961426, "logps/chosen": -0.3752022385597229, "logps/rejected": -0.4556891620159149, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.9698851704597473, "rewards/margins": 0.006209194660186768, "rewards/rejected": 0.9636759757995605, "step": 1156 }, { "epoch": 0.62, "learning_rate": 9.589717949565446e-08, "logits/chosen": -2.0252182483673096, "logits/rejected": -2.0183703899383545, "logps/chosen": -5.708870887756348, "logps/rejected": -6.429193019866943, "loss": 0.353, "rewards/accuracies": 1.0, "rewards/chosen": 1.3938196897506714, "rewards/margins": 0.8595638275146484, "rewards/rejected": 0.534255862236023, "step": 1157 }, { "epoch": 0.62, "learning_rate": 9.588851180568218e-08, "logits/chosen": -2.0955758094787598, "logits/rejected": -2.2413744926452637, "logps/chosen": -1.584006428718567, "logps/rejected": -1.5086467266082764, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0449546575546265, "rewards/margins": 0.013633370399475098, "rewards/rejected": 1.0313212871551514, "step": 1158 }, { "epoch": 0.63, "learning_rate": 9.587983536217253e-08, "logits/chosen": -2.203620672225952, "logits/rejected": -2.035477876663208, "logps/chosen": -70.4917221069336, "logps/rejected": -0.37546849250793457, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 1.4922370910644531, "rewards/margins": 0.6482965350151062, "rewards/rejected": 0.8439405560493469, "step": 1159 }, { "epoch": 0.63, "learning_rate": 9.587115016678055e-08, "logits/chosen": -2.1155543327331543, "logits/rejected": -2.1140620708465576, "logps/chosen": -4.873162269592285, "logps/rejected": -4.371789455413818, "loss": 0.5586, "rewards/accuracies": 1.0, "rewards/chosen": 1.0416029691696167, "rewards/margins": 0.29009294509887695, "rewards/rejected": 0.7515100240707397, "step": 1160 }, { "epoch": 0.63, "learning_rate": 9.586245622116303e-08, "logits/chosen": -2.0262246131896973, "logits/rejected": -2.02315616607666, "logps/chosen": -4.400116443634033, "logps/rejected": -7.631702423095703, "loss": 0.4841, "rewards/accuracies": 1.0, "rewards/chosen": 1.058882713317871, "rewards/margins": 0.4735459089279175, "rewards/rejected": 0.5853368043899536, "step": 1161 }, { "epoch": 0.63, "learning_rate": 9.58537535269784e-08, "logits/chosen": -2.0428402423858643, "logits/rejected": -2.0501372814178467, "logps/chosen": -3.3050131797790527, "logps/rejected": -1.3301678895950317, "loss": 0.4551, "rewards/accuracies": 1.0, "rewards/chosen": 1.2899973392486572, "rewards/margins": 0.5509965419769287, "rewards/rejected": 0.7390007972717285, "step": 1162 }, { "epoch": 0.63, "learning_rate": 9.584504208588673e-08, "logits/chosen": -2.070810556411743, "logits/rejected": -2.28157639503479, "logps/chosen": -0.6370769143104553, "logps/rejected": -0.7320992350578308, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.8377401232719421, "rewards/margins": 0.013991057872772217, "rewards/rejected": 0.8237490653991699, "step": 1163 }, { "epoch": 0.63, "learning_rate": 9.583632189954982e-08, "logits/chosen": -2.1386163234710693, "logits/rejected": -2.297089099884033, "logps/chosen": -0.8328000903129578, "logps/rejected": -0.9293737411499023, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 1.0812366008758545, "rewards/margins": 0.014203906059265137, "rewards/rejected": 1.0670326948165894, "step": 1164 }, { "epoch": 0.63, "learning_rate": 9.582759296963109e-08, "logits/chosen": -1.9822837114334106, "logits/rejected": -2.2612454891204834, "logps/chosen": -10.351851463317871, "logps/rejected": -8.363466262817383, "loss": 0.749, "rewards/accuracies": 0.0, "rewards/chosen": 0.8056477904319763, "rewards/margins": -0.10879600048065186, "rewards/rejected": 0.9144437909126282, "step": 1165 }, { "epoch": 0.63, "learning_rate": 9.581885529779563e-08, "logits/chosen": -2.200599193572998, "logits/rejected": -2.2924532890319824, "logps/chosen": -2.31335711479187, "logps/rejected": -2.3796801567077637, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.9600011110305786, "rewards/margins": 0.01602548360824585, "rewards/rejected": 0.9439756274223328, "step": 1166 }, { "epoch": 0.63, "learning_rate": 9.581010888571021e-08, "logits/chosen": -2.106340169906616, "logits/rejected": -2.045821189880371, "logps/chosen": -33.422733306884766, "logps/rejected": -2.091759443283081, "loss": 0.3973, "rewards/accuracies": 1.0, "rewards/chosen": 1.5222141742706299, "rewards/margins": 0.7178592085838318, "rewards/rejected": 0.8043549656867981, "step": 1167 }, { "epoch": 0.63, "learning_rate": 9.580135373504328e-08, "logits/chosen": -1.9475252628326416, "logits/rejected": -2.2355048656463623, "logps/chosen": -1.6530025005340576, "logps/rejected": -1.5797590017318726, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.9360093474388123, "rewards/margins": -0.0053139328956604, "rewards/rejected": 0.9413232803344727, "step": 1168 }, { "epoch": 0.63, "learning_rate": 9.579258984746493e-08, "logits/chosen": -2.0303564071655273, "logits/rejected": -2.2726669311523438, "logps/chosen": -0.6671500205993652, "logps/rejected": -0.641265332698822, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.6769890785217285, "rewards/margins": 0.03639155626296997, "rewards/rejected": 0.6405975222587585, "step": 1169 }, { "epoch": 0.63, "learning_rate": 9.578381722464692e-08, "logits/chosen": -2.0163464546203613, "logits/rejected": -2.229374408721924, "logps/chosen": -1.9908427000045776, "logps/rejected": -2.0646133422851562, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 1.156293272972107, "rewards/margins": 0.018741607666015625, "rewards/rejected": 1.1375516653060913, "step": 1170 }, { "epoch": 0.63, "learning_rate": 9.577503586826272e-08, "logits/chosen": -2.0120577812194824, "logits/rejected": -2.232259511947632, "logps/chosen": -6.1683454513549805, "logps/rejected": -1.6645087003707886, "loss": 0.8152, "rewards/accuracies": 0.0, "rewards/chosen": 0.6379387974739075, "rewards/margins": -0.23077619075775146, "rewards/rejected": 0.8687149882316589, "step": 1171 }, { "epoch": 0.63, "learning_rate": 9.57662457799874e-08, "logits/chosen": -2.0832157135009766, "logits/rejected": -2.279569625854492, "logps/chosen": -0.5724600553512573, "logps/rejected": -0.5997570753097534, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.7941145300865173, "rewards/margins": 0.0020813941955566406, "rewards/rejected": 0.7920331358909607, "step": 1172 }, { "epoch": 0.63, "learning_rate": 9.575744696149774e-08, "logits/chosen": -2.0041043758392334, "logits/rejected": -2.0087811946868896, "logps/chosen": -2.3209943771362305, "logps/rejected": -3.2609424591064453, "loss": 0.5341, "rewards/accuracies": 1.0, "rewards/chosen": 0.9111594557762146, "rewards/margins": 0.34824395179748535, "rewards/rejected": 0.5629155039787292, "step": 1173 }, { "epoch": 0.63, "learning_rate": 9.574863941447219e-08, "logits/chosen": -2.162315845489502, "logits/rejected": -2.1526546478271484, "logps/chosen": -3.436593532562256, "logps/rejected": -2.829674243927002, "loss": 0.4618, "rewards/accuracies": 1.0, "rewards/chosen": 1.1227020025253296, "rewards/margins": 0.5328904390335083, "rewards/rejected": 0.5898115634918213, "step": 1174 }, { "epoch": 0.63, "learning_rate": 9.573982314059081e-08, "logits/chosen": -2.071110248565674, "logits/rejected": -2.2922587394714355, "logps/chosen": -1.8201032876968384, "logps/rejected": -1.9634983539581299, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.5627784729003906, "rewards/margins": 0.024425864219665527, "rewards/rejected": 0.5383526086807251, "step": 1175 }, { "epoch": 0.63, "learning_rate": 9.57309981415354e-08, "logits/chosen": -2.1996798515319824, "logits/rejected": -2.2247095108032227, "logps/chosen": -1.8263391256332397, "logps/rejected": -1.948486566543579, "loss": 0.6755, "rewards/accuracies": 1.0, "rewards/chosen": 0.8416563868522644, "rewards/margins": 0.035655975341796875, "rewards/rejected": 0.8060004115104675, "step": 1176 }, { "epoch": 0.63, "learning_rate": 9.572216441898935e-08, "logits/chosen": -2.0935747623443604, "logits/rejected": -2.0971579551696777, "logps/chosen": -4.372669696807861, "logps/rejected": -3.0115623474121094, "loss": 0.4845, "rewards/accuracies": 1.0, "rewards/chosen": 1.1567553281784058, "rewards/margins": 0.47262853384017944, "rewards/rejected": 0.6841267943382263, "step": 1177 }, { "epoch": 0.64, "learning_rate": 9.571332197463779e-08, "logits/chosen": -2.0390501022338867, "logits/rejected": -2.0485734939575195, "logps/chosen": -8.568989753723145, "logps/rejected": -2.4126169681549072, "loss": 0.5576, "rewards/accuracies": 1.0, "rewards/chosen": 1.0424513816833496, "rewards/margins": 0.2924985885620117, "rewards/rejected": 0.7499527931213379, "step": 1178 }, { "epoch": 0.64, "learning_rate": 9.570447081016743e-08, "logits/chosen": -1.9795403480529785, "logits/rejected": -2.259648323059082, "logps/chosen": -1.4706275463104248, "logps/rejected": -1.516648530960083, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 1.0406397581100464, "rewards/margins": 0.0033898353576660156, "rewards/rejected": 1.0372499227523804, "step": 1179 }, { "epoch": 0.64, "learning_rate": 9.569561092726674e-08, "logits/chosen": -2.227118968963623, "logits/rejected": -2.2184510231018066, "logps/chosen": -1.614061713218689, "logps/rejected": -3.27838397026062, "loss": 0.5122, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427657127380371, "rewards/margins": 0.40214282274246216, "rewards/rejected": 0.540622889995575, "step": 1180 }, { "epoch": 0.64, "learning_rate": 9.568674232762577e-08, "logits/chosen": -1.9266958236694336, "logits/rejected": -1.943358302116394, "logps/chosen": -4.369081974029541, "logps/rejected": -5.604552268981934, "loss": 0.6303, "rewards/accuracies": 1.0, "rewards/chosen": 0.9325632452964783, "rewards/margins": 0.12998515367507935, "rewards/rejected": 0.8025780916213989, "step": 1181 }, { "epoch": 0.64, "learning_rate": 9.567786501293626e-08, "logits/chosen": -2.045039176940918, "logits/rejected": -2.3491570949554443, "logps/chosen": -2.791633129119873, "logps/rejected": -3.0254783630371094, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.922702431678772, "rewards/margins": 0.015735745429992676, "rewards/rejected": 0.9069666862487793, "step": 1182 }, { "epoch": 0.64, "learning_rate": 9.566897898489163e-08, "logits/chosen": -2.004333019256592, "logits/rejected": -2.3774707317352295, "logps/chosen": -0.6688379645347595, "logps/rejected": -31.666627883911133, "loss": 0.5143, "rewards/accuracies": 1.0, "rewards/chosen": 1.035513162612915, "rewards/margins": 0.39690667390823364, "rewards/rejected": 0.6386064887046814, "step": 1183 }, { "epoch": 0.64, "learning_rate": 9.566008424518695e-08, "logits/chosen": -2.0087709426879883, "logits/rejected": -2.0030651092529297, "logps/chosen": -7.042726993560791, "logps/rejected": -2.8730998039245605, "loss": 0.4718, "rewards/accuracies": 1.0, "rewards/chosen": 1.1878975629806519, "rewards/margins": 0.5059171915054321, "rewards/rejected": 0.6819803714752197, "step": 1184 }, { "epoch": 0.64, "learning_rate": 9.565118079551895e-08, "logits/chosen": -1.9699920415878296, "logits/rejected": -2.3206801414489746, "logps/chosen": -5.106575012207031, "logps/rejected": -5.347014904022217, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.899823009967804, "rewards/margins": 0.010468125343322754, "rewards/rejected": 0.8893548846244812, "step": 1185 }, { "epoch": 0.64, "learning_rate": 9.5642268637586e-08, "logits/chosen": -2.1371357440948486, "logits/rejected": -2.141247510910034, "logps/chosen": -2.617459297180176, "logps/rejected": -1.8266408443450928, "loss": 0.6359, "rewards/accuracies": 1.0, "rewards/chosen": 0.9887188076972961, "rewards/margins": 0.11788690090179443, "rewards/rejected": 0.8708319067955017, "step": 1186 }, { "epoch": 0.64, "learning_rate": 9.563334777308818e-08, "logits/chosen": -2.054358959197998, "logits/rejected": -2.0462820529937744, "logps/chosen": -6.360579013824463, "logps/rejected": -1.8223013877868652, "loss": 0.4753, "rewards/accuracies": 1.0, "rewards/chosen": 1.404732346534729, "rewards/margins": 0.4966387152671814, "rewards/rejected": 0.9080936312675476, "step": 1187 }, { "epoch": 0.64, "learning_rate": 9.562441820372719e-08, "logits/chosen": -2.1246724128723145, "logits/rejected": -2.2889106273651123, "logps/chosen": -7.269908905029297, "logps/rejected": -6.351141929626465, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.7134693264961243, "rewards/margins": 0.006039142608642578, "rewards/rejected": 0.7074301838874817, "step": 1188 }, { "epoch": 0.64, "learning_rate": 9.56154799312064e-08, "logits/chosen": -2.0828018188476562, "logits/rejected": -2.2321276664733887, "logps/chosen": -0.848418653011322, "logps/rejected": -0.8881620764732361, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.8332716822624207, "rewards/margins": 0.011673688888549805, "rewards/rejected": 0.8215979933738708, "step": 1189 }, { "epoch": 0.64, "learning_rate": 9.560653295723086e-08, "logits/chosen": -2.094526529312134, "logits/rejected": -2.1306357383728027, "logps/chosen": -4.391458988189697, "logps/rejected": -7.296517372131348, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.8484066128730774, "rewards/margins": 0.023010551929473877, "rewards/rejected": 0.8253960609436035, "step": 1190 }, { "epoch": 0.64, "learning_rate": 9.559757728350727e-08, "logits/chosen": -2.0457565784454346, "logits/rejected": -2.277747631072998, "logps/chosen": -0.8135164976119995, "logps/rejected": -0.908428966999054, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 1.0367792844772339, "rewards/margins": 0.0010933876037597656, "rewards/rejected": 1.0356858968734741, "step": 1191 }, { "epoch": 0.64, "learning_rate": 9.558861291174395e-08, "logits/chosen": -2.0112972259521484, "logits/rejected": -1.9938437938690186, "logps/chosen": -8.123430252075195, "logps/rejected": -4.429093360900879, "loss": 0.3932, "rewards/accuracies": 1.0, "rewards/chosen": 1.2566698789596558, "rewards/margins": 0.730269193649292, "rewards/rejected": 0.5264006853103638, "step": 1192 }, { "epoch": 0.64, "learning_rate": 9.557963984365096e-08, "logits/chosen": -2.018894672393799, "logits/rejected": -2.189056873321533, "logps/chosen": -2.506934404373169, "logps/rejected": -2.238654613494873, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.5602256655693054, "rewards/margins": 0.013922631740570068, "rewards/rejected": 0.5463030338287354, "step": 1193 }, { "epoch": 0.64, "learning_rate": 9.557065808093992e-08, "logits/chosen": -2.1394596099853516, "logits/rejected": -2.139648675918579, "logps/chosen": -1.9270457029342651, "logps/rejected": -5.870410919189453, "loss": 0.6682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9973499178886414, "rewards/margins": 0.050524890422821045, "rewards/rejected": 0.9468250274658203, "step": 1194 }, { "epoch": 0.64, "learning_rate": 9.55616676253242e-08, "logits/chosen": -1.9891319274902344, "logits/rejected": -1.98422110080719, "logps/chosen": -4.176350116729736, "logps/rejected": -3.6933536529541016, "loss": 0.3792, "rewards/accuracies": 1.0, "rewards/chosen": 1.3222957849502563, "rewards/margins": 0.7742300033569336, "rewards/rejected": 0.5480657815933228, "step": 1195 }, { "epoch": 0.65, "learning_rate": 9.555266847851879e-08, "logits/chosen": -1.9993656873703003, "logits/rejected": -2.255089044570923, "logps/chosen": -0.6741642355918884, "logps/rejected": -0.6633067727088928, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.9844304323196411, "rewards/margins": 0.040451228618621826, "rewards/rejected": 0.9439792037010193, "step": 1196 }, { "epoch": 0.65, "learning_rate": 9.554366064224031e-08, "logits/chosen": -2.0391674041748047, "logits/rejected": -2.254603385925293, "logps/chosen": -0.941163957118988, "logps/rejected": -0.9995688199996948, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.7019773125648499, "rewards/margins": 0.016961872577667236, "rewards/rejected": 0.6850154399871826, "step": 1197 }, { "epoch": 0.65, "learning_rate": 9.553464411820708e-08, "logits/chosen": -2.0392043590545654, "logits/rejected": -2.286802053451538, "logps/chosen": -4.389690399169922, "logps/rejected": -4.76806640625, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.6401836276054382, "rewards/margins": 0.015734851360321045, "rewards/rejected": 0.6244487762451172, "step": 1198 }, { "epoch": 0.65, "learning_rate": 9.552561890813906e-08, "logits/chosen": -1.934646725654602, "logits/rejected": -2.2114813327789307, "logps/chosen": -0.7405033707618713, "logps/rejected": -0.7677947878837585, "loss": 0.7074, "rewards/accuracies": 0.0, "rewards/chosen": 0.8953332304954529, "rewards/margins": -0.028385818004608154, "rewards/rejected": 0.923719048500061, "step": 1199 }, { "epoch": 0.65, "learning_rate": 9.551658501375785e-08, "logits/chosen": -2.13161563873291, "logits/rejected": -2.29756760597229, "logps/chosen": -0.5428347587585449, "logps/rejected": -0.5048588514328003, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.8322674036026001, "rewards/margins": 0.005414426326751709, "rewards/rejected": 0.8268529772758484, "step": 1200 }, { "epoch": 0.65, "learning_rate": 9.550754243678678e-08, "logits/chosen": -2.043813705444336, "logits/rejected": -2.049144744873047, "logps/chosen": -0.6523950099945068, "logps/rejected": -5.447596073150635, "loss": 0.4602, "rewards/accuracies": 1.0, "rewards/chosen": 0.9573394060134888, "rewards/margins": 0.537171483039856, "rewards/rejected": 0.4201678931713104, "step": 1201 }, { "epoch": 0.65, "learning_rate": 9.549849117895073e-08, "logits/chosen": -2.0492117404937744, "logits/rejected": -2.261714458465576, "logps/chosen": -1.130988597869873, "logps/rejected": -1.1147853136062622, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.8685365915298462, "rewards/margins": 0.02446073293685913, "rewards/rejected": 0.8440758585929871, "step": 1202 }, { "epoch": 0.65, "learning_rate": 9.548943124197629e-08, "logits/chosen": -2.013334035873413, "logits/rejected": -2.20418643951416, "logps/chosen": -2.428701162338257, "logps/rejected": -2.293393611907959, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.7059102058410645, "rewards/margins": 0.01240450143814087, "rewards/rejected": 0.6935057044029236, "step": 1203 }, { "epoch": 0.65, "learning_rate": 9.548036262759172e-08, "logits/chosen": -2.0824992656707764, "logits/rejected": -2.033176898956299, "logps/chosen": -34.925880432128906, "logps/rejected": -3.7051918506622314, "loss": 0.4874, "rewards/accuracies": 1.0, "rewards/chosen": 1.082147240638733, "rewards/margins": 0.4651803970336914, "rewards/rejected": 0.6169668436050415, "step": 1204 }, { "epoch": 0.65, "learning_rate": 9.54712853375269e-08, "logits/chosen": -2.103271722793579, "logits/rejected": -2.030372381210327, "logps/chosen": -10.527918815612793, "logps/rejected": -1.4659448862075806, "loss": 0.5414, "rewards/accuracies": 1.0, "rewards/chosen": 1.2515860795974731, "rewards/margins": 0.33069175481796265, "rewards/rejected": 0.9208943247795105, "step": 1205 }, { "epoch": 0.65, "learning_rate": 9.546219937351342e-08, "logits/chosen": -2.019727945327759, "logits/rejected": -2.2951254844665527, "logps/chosen": -1.6011548042297363, "logps/rejected": -1.420361042022705, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 1.0689634084701538, "rewards/margins": 0.008415937423706055, "rewards/rejected": 1.0605474710464478, "step": 1206 }, { "epoch": 0.65, "learning_rate": 9.545310473728444e-08, "logits/chosen": -2.129046678543091, "logits/rejected": -2.2338144779205322, "logps/chosen": -7.272000789642334, "logps/rejected": -5.196902275085449, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.8364826440811157, "rewards/margins": 0.030391037464141846, "rewards/rejected": 0.8060916066169739, "step": 1207 }, { "epoch": 0.65, "learning_rate": 9.544400143057489e-08, "logits/chosen": -2.0697953701019287, "logits/rejected": -2.068384885787964, "logps/chosen": -4.355313777923584, "logps/rejected": -1.92798912525177, "loss": 0.5892, "rewards/accuracies": 1.0, "rewards/chosen": 1.0407308340072632, "rewards/margins": 0.22004103660583496, "rewards/rejected": 0.8206897974014282, "step": 1208 }, { "epoch": 0.65, "learning_rate": 9.543488945512121e-08, "logits/chosen": -2.0668230056762695, "logits/rejected": -2.211808919906616, "logps/chosen": -1.1153030395507812, "logps/rejected": -1.2104706764221191, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.8953900337219238, "rewards/margins": 0.026072204113006592, "rewards/rejected": 0.8693178296089172, "step": 1209 }, { "epoch": 0.65, "learning_rate": 9.54257688126616e-08, "logits/chosen": -1.9763184785842896, "logits/rejected": -1.9854252338409424, "logps/chosen": -3.1609652042388916, "logps/rejected": -4.136143684387207, "loss": 0.4437, "rewards/accuracies": 1.0, "rewards/chosen": 1.325622797012329, "rewards/margins": 0.5824336409568787, "rewards/rejected": 0.7431891560554504, "step": 1210 }, { "epoch": 0.65, "learning_rate": 9.541663950493589e-08, "logits/chosen": -2.0906574726104736, "logits/rejected": -1.9943159818649292, "logps/chosen": -46.03700256347656, "logps/rejected": -3.487074375152588, "loss": 0.3746, "rewards/accuracies": 1.0, "rewards/chosen": 1.504541039466858, "rewards/margins": 0.7886080145835876, "rewards/rejected": 0.7159330248832703, "step": 1211 }, { "epoch": 0.65, "learning_rate": 9.540750153368556e-08, "logits/chosen": -2.1643502712249756, "logits/rejected": -2.276559352874756, "logps/chosen": -0.5380656719207764, "logps/rejected": -0.5546900033950806, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 1.033119797706604, "rewards/margins": 0.01991450786590576, "rewards/rejected": 1.0132052898406982, "step": 1212 }, { "epoch": 0.65, "learning_rate": 9.539835490065371e-08, "logits/chosen": -2.04461407661438, "logits/rejected": -2.291465997695923, "logps/chosen": -3.9503326416015625, "logps/rejected": -3.9182791709899902, "loss": 0.6717, "rewards/accuracies": 1.0, "rewards/chosen": 0.745248019695282, "rewards/margins": 0.043428659439086914, "rewards/rejected": 0.7018193602561951, "step": 1213 }, { "epoch": 0.65, "learning_rate": 9.538919960758515e-08, "logits/chosen": -2.0918819904327393, "logits/rejected": -2.1938741207122803, "logps/chosen": -22.973731994628906, "logps/rejected": -21.700695037841797, "loss": 0.4454, "rewards/accuracies": 1.0, "rewards/chosen": 1.2472705841064453, "rewards/margins": 0.5777532458305359, "rewards/rejected": 0.6695173382759094, "step": 1214 }, { "epoch": 0.66, "learning_rate": 9.53800356562263e-08, "logits/chosen": -2.0494019985198975, "logits/rejected": -2.0438244342803955, "logps/chosen": -5.0285539627075195, "logps/rejected": -3.1792092323303223, "loss": 0.5696, "rewards/accuracies": 1.0, "rewards/chosen": 0.8167201280593872, "rewards/margins": 0.26454514265060425, "rewards/rejected": 0.552174985408783, "step": 1215 }, { "epoch": 0.66, "learning_rate": 9.537086304832526e-08, "logits/chosen": -2.0647592544555664, "logits/rejected": -2.2793588638305664, "logps/chosen": -0.6715245246887207, "logps/rejected": -0.7111865878105164, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.8931171298027039, "rewards/margins": 0.020287692546844482, "rewards/rejected": 0.8728294372558594, "step": 1216 }, { "epoch": 0.66, "learning_rate": 9.536168178563173e-08, "logits/chosen": -2.2161242961883545, "logits/rejected": -2.142169713973999, "logps/chosen": -26.156131744384766, "logps/rejected": -6.0849690437316895, "loss": 0.5961, "rewards/accuracies": 1.0, "rewards/chosen": 1.016234278678894, "rewards/margins": 0.20450139045715332, "rewards/rejected": 0.8117328882217407, "step": 1217 }, { "epoch": 0.66, "learning_rate": 9.535249186989713e-08, "logits/chosen": -2.006476879119873, "logits/rejected": -2.005197286605835, "logps/chosen": -0.7519459128379822, "logps/rejected": -2.6952009201049805, "loss": 0.5342, "rewards/accuracies": 1.0, "rewards/chosen": 1.018239974975586, "rewards/margins": 0.34812116622924805, "rewards/rejected": 0.6701188087463379, "step": 1218 }, { "epoch": 0.66, "learning_rate": 9.53432933028745e-08, "logits/chosen": -2.2156331539154053, "logits/rejected": -2.074620485305786, "logps/chosen": -62.312957763671875, "logps/rejected": -25.504247665405273, "loss": 0.2996, "rewards/accuracies": 1.0, "rewards/chosen": 1.7130523920059204, "rewards/margins": 1.0517377853393555, "rewards/rejected": 0.6613146066665649, "step": 1219 }, { "epoch": 0.66, "learning_rate": 9.53340860863185e-08, "logits/chosen": -1.9588370323181152, "logits/rejected": -1.9722862243652344, "logps/chosen": -2.8209784030914307, "logps/rejected": -6.370959758758545, "loss": 0.4661, "rewards/accuracies": 1.0, "rewards/chosen": 1.1797001361846924, "rewards/margins": 0.5213150978088379, "rewards/rejected": 0.6583850383758545, "step": 1220 }, { "epoch": 0.66, "learning_rate": 9.532487022198549e-08, "logits/chosen": -2.1704041957855225, "logits/rejected": -2.160892963409424, "logps/chosen": -4.093863010406494, "logps/rejected": -9.126970291137695, "loss": 0.6629, "rewards/accuracies": 1.0, "rewards/chosen": 0.7878490090370178, "rewards/margins": 0.061346590518951416, "rewards/rejected": 0.7265024185180664, "step": 1221 }, { "epoch": 0.66, "learning_rate": 9.531564571163345e-08, "logits/chosen": -1.964410662651062, "logits/rejected": -2.2358951568603516, "logps/chosen": -3.536445379257202, "logps/rejected": -3.442910671234131, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 1.063227891921997, "rewards/margins": 0.022878289222717285, "rewards/rejected": 1.0403496026992798, "step": 1222 }, { "epoch": 0.66, "learning_rate": 9.530641255702201e-08, "logits/chosen": -2.3062798976898193, "logits/rejected": -2.0720765590667725, "logps/chosen": -55.3516731262207, "logps/rejected": -4.896308422088623, "loss": 0.3857, "rewards/accuracies": 1.0, "rewards/chosen": 1.3229175806045532, "rewards/margins": 0.7535821795463562, "rewards/rejected": 0.569335401058197, "step": 1223 }, { "epoch": 0.66, "learning_rate": 9.529717075991245e-08, "logits/chosen": -2.0831336975097656, "logits/rejected": -2.2379794120788574, "logps/chosen": -0.8077779412269592, "logps/rejected": -0.9873377680778503, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 1.0094648599624634, "rewards/margins": 0.014286458492279053, "rewards/rejected": 0.9951784014701843, "step": 1224 }, { "epoch": 0.66, "learning_rate": 9.528792032206772e-08, "logits/chosen": -2.0654239654541016, "logits/rejected": -2.067199945449829, "logps/chosen": -4.545506000518799, "logps/rejected": -13.36572551727295, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": 1.095768928527832, "rewards/margins": 0.9958252906799316, "rewards/rejected": 0.09994363784790039, "step": 1225 }, { "epoch": 0.66, "learning_rate": 9.527866124525241e-08, "logits/chosen": -2.0138185024261475, "logits/rejected": -2.0086288452148438, "logps/chosen": -5.646265506744385, "logps/rejected": -2.5560245513916016, "loss": 0.487, "rewards/accuracies": 1.0, "rewards/chosen": 1.2118574380874634, "rewards/margins": 0.4661552309989929, "rewards/rejected": 0.7457022070884705, "step": 1226 }, { "epoch": 0.66, "learning_rate": 9.526939353123273e-08, "logits/chosen": -2.2054176330566406, "logits/rejected": -2.077381134033203, "logps/chosen": -42.406063079833984, "logps/rejected": -0.4206477999687195, "loss": 0.2639, "rewards/accuracies": 1.0, "rewards/chosen": 2.005100727081299, "rewards/margins": 1.1975406408309937, "rewards/rejected": 0.8075600862503052, "step": 1227 }, { "epoch": 0.66, "learning_rate": 9.526011718177656e-08, "logits/chosen": -2.139725923538208, "logits/rejected": -2.2601852416992188, "logps/chosen": -1.236236810684204, "logps/rejected": -1.268495798110962, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.9026922583580017, "rewards/margins": 0.00325852632522583, "rewards/rejected": 0.8994337320327759, "step": 1228 }, { "epoch": 0.66, "learning_rate": 9.525083219865343e-08, "logits/chosen": -2.2202916145324707, "logits/rejected": -2.2152187824249268, "logps/chosen": -8.754060745239258, "logps/rejected": -3.7123095989227295, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": 1.336718201637268, "rewards/margins": 0.8347799181938171, "rewards/rejected": 0.5019382834434509, "step": 1229 }, { "epoch": 0.66, "learning_rate": 9.52415385836345e-08, "logits/chosen": -1.9227956533432007, "logits/rejected": -2.211641311645508, "logps/chosen": -2.5883257389068604, "logps/rejected": -2.5712647438049316, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.7872015237808228, "rewards/margins": -0.013778090476989746, "rewards/rejected": 0.8009796142578125, "step": 1230 }, { "epoch": 0.66, "learning_rate": 9.523223633849263e-08, "logits/chosen": -2.1199254989624023, "logits/rejected": -2.0211691856384277, "logps/chosen": -27.59512710571289, "logps/rejected": -2.069425106048584, "loss": 0.4042, "rewards/accuracies": 1.0, "rewards/chosen": 1.4759228229522705, "rewards/margins": 0.6968168020248413, "rewards/rejected": 0.7791060209274292, "step": 1231 }, { "epoch": 0.66, "learning_rate": 9.522292546500224e-08, "logits/chosen": -2.0288615226745605, "logits/rejected": -2.0346176624298096, "logps/chosen": -2.837876558303833, "logps/rejected": -4.047950744628906, "loss": 0.4757, "rewards/accuracies": 1.0, "rewards/chosen": 1.069879412651062, "rewards/margins": 0.4957364797592163, "rewards/rejected": 0.5741429328918457, "step": 1232 }, { "epoch": 0.67, "learning_rate": 9.521360596493946e-08, "logits/chosen": -1.9529155492782593, "logits/rejected": -2.256089687347412, "logps/chosen": -4.663189888000488, "logps/rejected": -5.230515956878662, "loss": 0.656, "rewards/accuracies": 1.0, "rewards/chosen": 0.7812560200691223, "rewards/margins": 0.07562482357025146, "rewards/rejected": 0.7056311964988708, "step": 1233 }, { "epoch": 0.67, "learning_rate": 9.520427784008203e-08, "logits/chosen": -2.072873592376709, "logits/rejected": -2.2222282886505127, "logps/chosen": -2.18521785736084, "logps/rejected": -2.086900234222412, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9409840703010559, "rewards/margins": 0.02046823501586914, "rewards/rejected": 0.9205158352851868, "step": 1234 }, { "epoch": 0.67, "learning_rate": 9.519494109220938e-08, "logits/chosen": -2.019700765609741, "logits/rejected": -2.266648054122925, "logps/chosen": -0.590289831161499, "logps/rejected": -0.613338053226471, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.9293683171272278, "rewards/margins": 0.004172027111053467, "rewards/rejected": 0.9251962900161743, "step": 1235 }, { "epoch": 0.67, "learning_rate": 9.518559572310254e-08, "logits/chosen": -2.1151442527770996, "logits/rejected": -2.1173884868621826, "logps/chosen": -0.37823551893234253, "logps/rejected": -3.4512383937835693, "loss": 0.5865, "rewards/accuracies": 1.0, "rewards/chosen": 0.8249463438987732, "rewards/margins": 0.22601377964019775, "rewards/rejected": 0.5989325642585754, "step": 1236 }, { "epoch": 0.67, "learning_rate": 9.51762417345442e-08, "logits/chosen": -1.9143757820129395, "logits/rejected": -1.948107361793518, "logps/chosen": -1.767411708831787, "logps/rejected": -12.7621488571167, "loss": 0.6007, "rewards/accuracies": 1.0, "rewards/chosen": 0.9662920236587524, "rewards/margins": 0.1942088007926941, "rewards/rejected": 0.7720832228660583, "step": 1237 }, { "epoch": 0.67, "learning_rate": 9.51668791283187e-08, "logits/chosen": -2.0134987831115723, "logits/rejected": -2.0110151767730713, "logps/chosen": -1.9053406715393066, "logps/rejected": -5.195662498474121, "loss": 0.5013, "rewards/accuracies": 1.0, "rewards/chosen": 0.942656934261322, "rewards/margins": 0.4294247627258301, "rewards/rejected": 0.5132321715354919, "step": 1238 }, { "epoch": 0.67, "learning_rate": 9.5157507906212e-08, "logits/chosen": -2.103135585784912, "logits/rejected": -2.3332772254943848, "logps/chosen": -4.17760705947876, "logps/rejected": -4.249572277069092, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.4841214716434479, "rewards/margins": 0.027688324451446533, "rewards/rejected": 0.45643314719200134, "step": 1239 }, { "epoch": 0.67, "learning_rate": 9.514812807001173e-08, "logits/chosen": -2.0001108646392822, "logits/rejected": -2.004359245300293, "logps/chosen": -3.209517478942871, "logps/rejected": -1.11222505569458, "loss": 0.6309, "rewards/accuracies": 1.0, "rewards/chosen": 1.0969117879867554, "rewards/margins": 0.12862563133239746, "rewards/rejected": 0.9682861566543579, "step": 1240 }, { "epoch": 0.67, "learning_rate": 9.513873962150718e-08, "logits/chosen": -2.0657927989959717, "logits/rejected": -2.273392677307129, "logps/chosen": -1.599031686782837, "logps/rejected": -1.5040783882141113, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 1.0320745706558228, "rewards/margins": 0.028610944747924805, "rewards/rejected": 1.003463625907898, "step": 1241 }, { "epoch": 0.67, "learning_rate": 9.512934256248926e-08, "logits/chosen": -2.0134379863739014, "logits/rejected": -2.0136702060699463, "logps/chosen": -1.2859725952148438, "logps/rejected": -2.511277914047241, "loss": 0.5402, "rewards/accuracies": 1.0, "rewards/chosen": 0.9301316142082214, "rewards/margins": 0.3335835933685303, "rewards/rejected": 0.5965480208396912, "step": 1242 }, { "epoch": 0.67, "learning_rate": 9.511993689475048e-08, "logits/chosen": -2.0006980895996094, "logits/rejected": -2.230177402496338, "logps/chosen": -0.8344556093215942, "logps/rejected": -0.7693433165550232, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8149217963218689, "rewards/margins": 0.019276440143585205, "rewards/rejected": 0.7956453561782837, "step": 1243 }, { "epoch": 0.67, "learning_rate": 9.511052262008506e-08, "logits/chosen": -2.0374724864959717, "logits/rejected": -2.0400967597961426, "logps/chosen": -1.2730047702789307, "logps/rejected": -2.1859278678894043, "loss": 0.563, "rewards/accuracies": 1.0, "rewards/chosen": 0.888311505317688, "rewards/margins": 0.2798876166343689, "rewards/rejected": 0.6084238886833191, "step": 1244 }, { "epoch": 0.67, "learning_rate": 9.510109974028884e-08, "logits/chosen": -1.9467743635177612, "logits/rejected": -2.3059184551239014, "logps/chosen": -2.8555402755737305, "logps/rejected": -3.0499281883239746, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.8259276747703552, "rewards/margins": -0.005549490451812744, "rewards/rejected": 0.831477165222168, "step": 1245 }, { "epoch": 0.67, "learning_rate": 9.509166825715928e-08, "logits/chosen": -2.119112253189087, "logits/rejected": -2.31172251701355, "logps/chosen": -13.622446060180664, "logps/rejected": -14.406984329223633, "loss": 0.6251, "rewards/accuracies": 1.0, "rewards/chosen": 0.8577249646186829, "rewards/margins": 0.14108049869537354, "rewards/rejected": 0.7166444659233093, "step": 1246 }, { "epoch": 0.67, "learning_rate": 9.508222817249553e-08, "logits/chosen": -1.9795103073120117, "logits/rejected": -2.2638797760009766, "logps/chosen": -0.7323304414749146, "logps/rejected": -0.6920445561408997, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.7909196019172668, "rewards/margins": 0.018144547939300537, "rewards/rejected": 0.7727750539779663, "step": 1247 }, { "epoch": 0.67, "learning_rate": 9.507277948809831e-08, "logits/chosen": -2.072455406188965, "logits/rejected": -2.073547840118408, "logps/chosen": -4.002212047576904, "logps/rejected": -0.3937394618988037, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 1.2993236780166626, "rewards/margins": 0.5200650095939636, "rewards/rejected": 0.779258668422699, "step": 1248 }, { "epoch": 0.67, "learning_rate": 9.506332220577003e-08, "logits/chosen": -2.0922951698303223, "logits/rejected": -2.1199533939361572, "logps/chosen": -3.462049961090088, "logps/rejected": -5.972071647644043, "loss": 0.5464, "rewards/accuracies": 1.0, "rewards/chosen": 1.0440717935562134, "rewards/margins": 0.3189118504524231, "rewards/rejected": 0.7251599431037903, "step": 1249 }, { "epoch": 0.67, "learning_rate": 9.505385632731475e-08, "logits/chosen": -1.9660571813583374, "logits/rejected": -2.278445243835449, "logps/chosen": -0.8289838433265686, "logps/rejected": -0.8381490707397461, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.8473397493362427, "rewards/margins": 0.005476593971252441, "rewards/rejected": 0.8418631553649902, "step": 1250 }, { "epoch": 0.67, "learning_rate": 9.504438185453812e-08, "logits/chosen": -2.054871082305908, "logits/rejected": -2.0132312774658203, "logps/chosen": -34.92195510864258, "logps/rejected": -1.9735924005508423, "loss": 0.4904, "rewards/accuracies": 1.0, "rewards/chosen": 1.2641651630401611, "rewards/margins": 0.457231342792511, "rewards/rejected": 0.8069338202476501, "step": 1251 }, { "epoch": 0.68, "learning_rate": 9.503489878924748e-08, "logits/chosen": -1.9195184707641602, "logits/rejected": -2.2470202445983887, "logps/chosen": -3.0895748138427734, "logps/rejected": -6.233071804046631, "loss": 0.6469, "rewards/accuracies": 1.0, "rewards/chosen": 0.857435405254364, "rewards/margins": 0.0947989821434021, "rewards/rejected": 0.7626364231109619, "step": 1252 }, { "epoch": 0.68, "learning_rate": 9.502540713325177e-08, "logits/chosen": -2.2007782459259033, "logits/rejected": -2.2949323654174805, "logps/chosen": -18.576547622680664, "logps/rejected": -13.452717781066895, "loss": 0.7038, "rewards/accuracies": 0.0, "rewards/chosen": 0.49088650941848755, "rewards/margins": -0.0212632417678833, "rewards/rejected": 0.5121497511863708, "step": 1253 }, { "epoch": 0.68, "learning_rate": 9.50159068883616e-08, "logits/chosen": -2.0698723793029785, "logits/rejected": -2.328502655029297, "logps/chosen": -1.559075117111206, "logps/rejected": -1.6870381832122803, "loss": 0.6659, "rewards/accuracies": 1.0, "rewards/chosen": 0.7961871027946472, "rewards/margins": 0.05520504713058472, "rewards/rejected": 0.7409820556640625, "step": 1254 }, { "epoch": 0.68, "learning_rate": 9.500639805638919e-08, "logits/chosen": -2.0566580295562744, "logits/rejected": -2.0556890964508057, "logps/chosen": -1.129387617111206, "logps/rejected": -1.7395703792572021, "loss": 0.5904, "rewards/accuracies": 1.0, "rewards/chosen": 1.00077223777771, "rewards/margins": 0.21737128496170044, "rewards/rejected": 0.7834009528160095, "step": 1255 }, { "epoch": 0.68, "learning_rate": 9.499688063914843e-08, "logits/chosen": -2.0890021324157715, "logits/rejected": -2.092280626296997, "logps/chosen": -0.856866717338562, "logps/rejected": -2.3116626739501953, "loss": 0.5527, "rewards/accuracies": 1.0, "rewards/chosen": 0.9231804013252258, "rewards/margins": 0.303865909576416, "rewards/rejected": 0.6193144917488098, "step": 1256 }, { "epoch": 0.68, "learning_rate": 9.498735463845485e-08, "logits/chosen": -2.0493576526641846, "logits/rejected": -2.3035850524902344, "logps/chosen": -6.281811714172363, "logps/rejected": -3.1941821575164795, "loss": 0.7935, "rewards/accuracies": 0.0, "rewards/chosen": 0.6350765228271484, "rewards/margins": -0.19162052869796753, "rewards/rejected": 0.826697051525116, "step": 1257 }, { "epoch": 0.68, "learning_rate": 9.497782005612553e-08, "logits/chosen": -2.1595041751861572, "logits/rejected": -2.0616233348846436, "logps/chosen": -47.999366760253906, "logps/rejected": -2.003617763519287, "loss": 0.3612, "rewards/accuracies": 1.0, "rewards/chosen": 1.4808571338653564, "rewards/margins": 0.8321799039840698, "rewards/rejected": 0.6486772298812866, "step": 1258 }, { "epoch": 0.68, "learning_rate": 9.496827689397931e-08, "logits/chosen": -2.006969928741455, "logits/rejected": -2.011861562728882, "logps/chosen": -1.607051134109497, "logps/rejected": -2.5128469467163086, "loss": 0.5156, "rewards/accuracies": 1.0, "rewards/chosen": 1.0609302520751953, "rewards/margins": 0.39358454942703247, "rewards/rejected": 0.6673457026481628, "step": 1259 }, { "epoch": 0.68, "learning_rate": 9.495872515383661e-08, "logits/chosen": -1.9743800163269043, "logits/rejected": -1.975740671157837, "logps/chosen": -1.2076302766799927, "logps/rejected": -2.6989967823028564, "loss": 0.5599, "rewards/accuracies": 1.0, "rewards/chosen": 0.9228202700614929, "rewards/margins": 0.2871217727661133, "rewards/rejected": 0.6356984972953796, "step": 1260 }, { "epoch": 0.68, "learning_rate": 9.494916483751946e-08, "logits/chosen": -2.063478946685791, "logits/rejected": -2.059473991394043, "logps/chosen": -4.6606621742248535, "logps/rejected": -3.204148054122925, "loss": 0.582, "rewards/accuracies": 1.0, "rewards/chosen": 0.9290773272514343, "rewards/margins": 0.2362687587738037, "rewards/rejected": 0.6928085684776306, "step": 1261 }, { "epoch": 0.68, "learning_rate": 9.493959594685157e-08, "logits/chosen": -2.055814027786255, "logits/rejected": -2.047090530395508, "logps/chosen": -0.5922131538391113, "logps/rejected": -5.255156517028809, "loss": 0.518, "rewards/accuracies": 1.0, "rewards/chosen": 0.8998140692710876, "rewards/margins": 0.3876722455024719, "rewards/rejected": 0.5121418237686157, "step": 1262 }, { "epoch": 0.68, "learning_rate": 9.493001848365826e-08, "logits/chosen": -2.0619099140167236, "logits/rejected": -2.255112886428833, "logps/chosen": -0.5202378034591675, "logps/rejected": -0.6418343186378479, "loss": 0.699, "rewards/accuracies": 0.0, "rewards/chosen": 0.8506051301956177, "rewards/margins": -0.011736273765563965, "rewards/rejected": 0.8623414039611816, "step": 1263 }, { "epoch": 0.68, "learning_rate": 9.492043244976653e-08, "logits/chosen": -2.048625946044922, "logits/rejected": -2.049867868423462, "logps/chosen": -0.7968226671218872, "logps/rejected": -2.52077054977417, "loss": 0.5594, "rewards/accuracies": 1.0, "rewards/chosen": 1.024687647819519, "rewards/margins": 0.2881094217300415, "rewards/rejected": 0.7365782260894775, "step": 1264 }, { "epoch": 0.68, "learning_rate": 9.491083784700492e-08, "logits/chosen": -1.9790363311767578, "logits/rejected": -2.2823705673217773, "logps/chosen": -1.4926121234893799, "logps/rejected": -4.860263824462891, "loss": 0.608, "rewards/accuracies": 1.0, "rewards/chosen": 0.9725407958030701, "rewards/margins": 0.17830848693847656, "rewards/rejected": 0.7942323088645935, "step": 1265 }, { "epoch": 0.68, "learning_rate": 9.49012346772037e-08, "logits/chosen": -2.095982551574707, "logits/rejected": -2.09812068939209, "logps/chosen": -1.9876412153244019, "logps/rejected": -2.112096071243286, "loss": 0.5365, "rewards/accuracies": 1.0, "rewards/chosen": 1.013835072517395, "rewards/margins": 0.3425310254096985, "rewards/rejected": 0.6713040471076965, "step": 1266 }, { "epoch": 0.68, "learning_rate": 9.489162294219475e-08, "logits/chosen": -1.909746766090393, "logits/rejected": -2.2452213764190674, "logps/chosen": -0.5358057022094727, "logps/rejected": -0.5671930313110352, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.9337652325630188, "rewards/margins": 0.01205146312713623, "rewards/rejected": 0.9217137694358826, "step": 1267 }, { "epoch": 0.68, "learning_rate": 9.488200264381154e-08, "logits/chosen": -1.9522451162338257, "logits/rejected": -2.246593952178955, "logps/chosen": -1.1635602712631226, "logps/rejected": -2.705231189727783, "loss": 0.617, "rewards/accuracies": 1.0, "rewards/chosen": 0.9881674647331238, "rewards/margins": 0.15855413675308228, "rewards/rejected": 0.8296133279800415, "step": 1268 }, { "epoch": 0.68, "learning_rate": 9.487237378388923e-08, "logits/chosen": -2.0306432247161865, "logits/rejected": -2.0256118774414062, "logps/chosen": -10.145868301391602, "logps/rejected": -1.6163856983184814, "loss": 0.4157, "rewards/accuracies": 1.0, "rewards/chosen": 1.3684942722320557, "rewards/margins": 0.6627969145774841, "rewards/rejected": 0.7056973576545715, "step": 1269 }, { "epoch": 0.69, "learning_rate": 9.486273636426458e-08, "logits/chosen": -2.0949912071228027, "logits/rejected": -2.0939300060272217, "logps/chosen": -2.5068273544311523, "logps/rejected": -2.357809066772461, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 1.024103045463562, "rewards/margins": 0.012489795684814453, "rewards/rejected": 1.0116132497787476, "step": 1270 }, { "epoch": 0.69, "learning_rate": 9.485309038677597e-08, "logits/chosen": -2.067797899246216, "logits/rejected": -2.2615880966186523, "logps/chosen": -1.870429515838623, "logps/rejected": -1.3119992017745972, "loss": 0.711, "rewards/accuracies": 0.0, "rewards/chosen": 0.5105589628219604, "rewards/margins": -0.03537207841873169, "rewards/rejected": 0.5459310412406921, "step": 1271 }, { "epoch": 0.69, "learning_rate": 9.484343585326348e-08, "logits/chosen": -2.0534095764160156, "logits/rejected": -2.3180363178253174, "logps/chosen": -0.43835631012916565, "logps/rejected": -0.4530491828918457, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.827390193939209, "rewards/margins": 0.022219717502593994, "rewards/rejected": 0.805170476436615, "step": 1272 }, { "epoch": 0.69, "learning_rate": 9.483377276556874e-08, "logits/chosen": -2.0376250743865967, "logits/rejected": -2.219839572906494, "logps/chosen": -0.5237851142883301, "logps/rejected": -0.6525989770889282, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": 0.7698057889938354, "rewards/margins": -0.014941990375518799, "rewards/rejected": 0.7847477793693542, "step": 1273 }, { "epoch": 0.69, "learning_rate": 9.482410112553507e-08, "logits/chosen": -2.091277837753296, "logits/rejected": -2.2253196239471436, "logps/chosen": -2.6497645378112793, "logps/rejected": -0.8521628975868225, "loss": 0.7019, "rewards/accuracies": 0.0, "rewards/chosen": 0.8205286264419556, "rewards/margins": -0.01737910509109497, "rewards/rejected": 0.8379077315330505, "step": 1274 }, { "epoch": 0.69, "learning_rate": 9.481442093500738e-08, "logits/chosen": -2.019576072692871, "logits/rejected": -2.015291213989258, "logps/chosen": -5.466073036193848, "logps/rejected": -2.514739990234375, "loss": 0.4098, "rewards/accuracies": 1.0, "rewards/chosen": 1.4319261312484741, "rewards/margins": 0.6801173090934753, "rewards/rejected": 0.7518088221549988, "step": 1275 }, { "epoch": 0.69, "learning_rate": 9.480473219583225e-08, "logits/chosen": -2.0640721321105957, "logits/rejected": -2.063220500946045, "logps/chosen": -1.0436856746673584, "logps/rejected": -2.945772647857666, "loss": 0.5798, "rewards/accuracies": 1.0, "rewards/chosen": 0.8527266383171082, "rewards/margins": 0.24114924669265747, "rewards/rejected": 0.6115773916244507, "step": 1276 }, { "epoch": 0.69, "learning_rate": 9.479503490985786e-08, "logits/chosen": -2.1372766494750977, "logits/rejected": -2.1194448471069336, "logps/chosen": -17.33637809753418, "logps/rejected": -2.8877267837524414, "loss": 0.4349, "rewards/accuracies": 1.0, "rewards/chosen": 1.1344965696334839, "rewards/margins": 0.607292890548706, "rewards/rejected": 0.5272036790847778, "step": 1277 }, { "epoch": 0.69, "learning_rate": 9.478532907893402e-08, "logits/chosen": -1.9843956232070923, "logits/rejected": -2.2647597789764404, "logps/chosen": -6.642147541046143, "logps/rejected": -6.3277459144592285, "loss": 0.7086, "rewards/accuracies": 0.0, "rewards/chosen": 0.5094308257102966, "rewards/margins": -0.030640840530395508, "rewards/rejected": 0.5400716662406921, "step": 1278 }, { "epoch": 0.69, "learning_rate": 9.47756147049122e-08, "logits/chosen": -2.0615756511688232, "logits/rejected": -2.2345335483551025, "logps/chosen": -0.8704825639724731, "logps/rejected": -0.8435212969779968, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.8088347315788269, "rewards/margins": 0.03434532880783081, "rewards/rejected": 0.7744894027709961, "step": 1279 }, { "epoch": 0.69, "learning_rate": 9.476589178964546e-08, "logits/chosen": -2.053907632827759, "logits/rejected": -2.0823113918304443, "logps/chosen": -3.5789506435394287, "logps/rejected": -7.968429088592529, "loss": 0.366, "rewards/accuracies": 1.0, "rewards/chosen": 1.5232843160629272, "rewards/margins": 0.8165926933288574, "rewards/rejected": 0.7066916227340698, "step": 1280 }, { "epoch": 0.69, "learning_rate": 9.475616033498855e-08, "logits/chosen": -1.975942850112915, "logits/rejected": -1.971419334411621, "logps/chosen": -9.501778602600098, "logps/rejected": -3.521265983581543, "loss": 0.5348, "rewards/accuracies": 1.0, "rewards/chosen": 1.3805407285690308, "rewards/margins": 0.3464750051498413, "rewards/rejected": 1.0340657234191895, "step": 1281 }, { "epoch": 0.69, "learning_rate": 9.474642034279778e-08, "logits/chosen": -2.026874542236328, "logits/rejected": -2.2203805446624756, "logps/chosen": -0.4692680537700653, "logps/rejected": -0.5573816299438477, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.7554079294204712, "rewards/margins": 0.012558579444885254, "rewards/rejected": 0.7428493499755859, "step": 1282 }, { "epoch": 0.69, "learning_rate": 9.473667181493111e-08, "logits/chosen": -2.002725601196289, "logits/rejected": -2.301820993423462, "logps/chosen": -0.710938572883606, "logps/rejected": -0.6995298862457275, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.9916803240776062, "rewards/margins": 0.014688491821289062, "rewards/rejected": 0.9769918322563171, "step": 1283 }, { "epoch": 0.69, "learning_rate": 9.472691475324814e-08, "logits/chosen": -2.004603147506714, "logits/rejected": -2.016911745071411, "logps/chosen": -2.119520664215088, "logps/rejected": -6.90715217590332, "loss": 0.4579, "rewards/accuracies": 1.0, "rewards/chosen": 1.0634769201278687, "rewards/margins": 0.5433165431022644, "rewards/rejected": 0.5201603770256042, "step": 1284 }, { "epoch": 0.69, "learning_rate": 9.471714915961014e-08, "logits/chosen": -2.1351239681243896, "logits/rejected": -2.138432502746582, "logps/chosen": -1.9745500087738037, "logps/rejected": -2.933068037033081, "loss": 0.5455, "rewards/accuracies": 1.0, "rewards/chosen": 1.025173544883728, "rewards/margins": 0.32090747356414795, "rewards/rejected": 0.7042660713195801, "step": 1285 }, { "epoch": 0.69, "learning_rate": 9.470737503587989e-08, "logits/chosen": -2.0051817893981934, "logits/rejected": -1.999804973602295, "logps/chosen": -3.143937110900879, "logps/rejected": -3.4373738765716553, "loss": 0.4329, "rewards/accuracies": 1.0, "rewards/chosen": 1.2141273021697998, "rewards/margins": 0.6129899621009827, "rewards/rejected": 0.6011373400688171, "step": 1286 }, { "epoch": 0.69, "learning_rate": 9.469759238392191e-08, "logits/chosen": -1.9739251136779785, "logits/rejected": -1.9770928621292114, "logps/chosen": -1.18471360206604, "logps/rejected": -3.3382742404937744, "loss": 0.5288, "rewards/accuracies": 1.0, "rewards/chosen": 1.0441635847091675, "rewards/margins": 0.3611827492713928, "rewards/rejected": 0.6829808354377747, "step": 1287 }, { "epoch": 0.69, "learning_rate": 9.46878012056023e-08, "logits/chosen": -1.9923967123031616, "logits/rejected": -1.962692379951477, "logps/chosen": -18.671262741088867, "logps/rejected": -1.7543182373046875, "loss": 0.5301, "rewards/accuracies": 1.0, "rewards/chosen": 1.219167947769165, "rewards/margins": 0.3580728769302368, "rewards/rejected": 0.8610950708389282, "step": 1288 }, { "epoch": 0.7, "learning_rate": 9.467800150278879e-08, "logits/chosen": -2.1913673877716064, "logits/rejected": -2.156161069869995, "logps/chosen": -32.768211364746094, "logps/rejected": -2.220423698425293, "loss": 0.5273, "rewards/accuracies": 1.0, "rewards/chosen": 1.363668441772461, "rewards/margins": 0.36478108167648315, "rewards/rejected": 0.9988873600959778, "step": 1289 }, { "epoch": 0.7, "learning_rate": 9.466819327735074e-08, "logits/chosen": -2.0833799839019775, "logits/rejected": -2.273418664932251, "logps/chosen": -0.919975996017456, "logps/rejected": -0.8585304021835327, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.9882646799087524, "rewards/margins": 0.04249328374862671, "rewards/rejected": 0.9457713961601257, "step": 1290 }, { "epoch": 0.7, "learning_rate": 9.465837653115914e-08, "logits/chosen": -2.069653034210205, "logits/rejected": -2.079042434692383, "logps/chosen": -5.313079833984375, "logps/rejected": -2.1594786643981934, "loss": 0.3923, "rewards/accuracies": 1.0, "rewards/chosen": 1.3247188329696655, "rewards/margins": 0.7330319285392761, "rewards/rejected": 0.5916869044303894, "step": 1291 }, { "epoch": 0.7, "learning_rate": 9.46485512660866e-08, "logits/chosen": -2.117032289505005, "logits/rejected": -2.12019681930542, "logps/chosen": -3.6028382778167725, "logps/rejected": -4.947194576263428, "loss": 0.5385, "rewards/accuracies": 1.0, "rewards/chosen": 0.991290271282196, "rewards/margins": 0.33764147758483887, "rewards/rejected": 0.6536487936973572, "step": 1292 }, { "epoch": 0.7, "learning_rate": 9.463871748400734e-08, "logits/chosen": -2.0226686000823975, "logits/rejected": -2.0219340324401855, "logps/chosen": -0.52327960729599, "logps/rejected": -2.988004446029663, "loss": 0.5369, "rewards/accuracies": 1.0, "rewards/chosen": 1.0498522520065308, "rewards/margins": 0.34161990880966187, "rewards/rejected": 0.7082323431968689, "step": 1293 }, { "epoch": 0.7, "learning_rate": 9.462887518679721e-08, "logits/chosen": -2.214531660079956, "logits/rejected": -2.260047674179077, "logps/chosen": -11.666690826416016, "logps/rejected": -8.607807159423828, "loss": 0.7102, "rewards/accuracies": 0.0, "rewards/chosen": 0.7482025027275085, "rewards/margins": -0.03386044502258301, "rewards/rejected": 0.7820629477500916, "step": 1294 }, { "epoch": 0.7, "learning_rate": 9.461902437633372e-08, "logits/chosen": -2.0826587677001953, "logits/rejected": -2.081526756286621, "logps/chosen": -0.5297015309333801, "logps/rejected": -6.608668327331543, "loss": 0.4015, "rewards/accuracies": 1.0, "rewards/chosen": 0.9634880423545837, "rewards/margins": 0.7052183151245117, "rewards/rejected": 0.25826969742774963, "step": 1295 }, { "epoch": 0.7, "learning_rate": 9.460916505449597e-08, "logits/chosen": -2.020129680633545, "logits/rejected": -2.020050048828125, "logps/chosen": -3.662792205810547, "logps/rejected": -4.766458988189697, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 0.9612997174263, "rewards/margins": 0.051827073097229004, "rewards/rejected": 0.909472644329071, "step": 1296 }, { "epoch": 0.7, "learning_rate": 9.459929722316469e-08, "logits/chosen": -1.9877889156341553, "logits/rejected": -2.0685460567474365, "logps/chosen": -17.669139862060547, "logps/rejected": -17.959938049316406, "loss": 0.5881, "rewards/accuracies": 1.0, "rewards/chosen": 1.120936632156372, "rewards/margins": 0.22246956825256348, "rewards/rejected": 0.8984670639038086, "step": 1297 }, { "epoch": 0.7, "learning_rate": 9.458942088422221e-08, "logits/chosen": -2.145622730255127, "logits/rejected": -2.144035577774048, "logps/chosen": -0.5719636678695679, "logps/rejected": -3.5528085231781006, "loss": 0.5373, "rewards/accuracies": 1.0, "rewards/chosen": 0.8493375778198242, "rewards/margins": 0.3404389023780823, "rewards/rejected": 0.5088986754417419, "step": 1298 }, { "epoch": 0.7, "learning_rate": 9.457953603955255e-08, "logits/chosen": -1.9577080011367798, "logits/rejected": -2.2569236755371094, "logps/chosen": -2.7515828609466553, "logps/rejected": -2.8029444217681885, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.9918146133422852, "rewards/margins": -0.0024968981742858887, "rewards/rejected": 0.994311511516571, "step": 1299 }, { "epoch": 0.7, "learning_rate": 9.456964269104129e-08, "logits/chosen": -2.0362648963928223, "logits/rejected": -2.0254743099212646, "logps/chosen": -7.7428765296936035, "logps/rejected": -2.1295666694641113, "loss": 0.4922, "rewards/accuracies": 1.0, "rewards/chosen": 1.3334680795669556, "rewards/margins": 0.452619731426239, "rewards/rejected": 0.8808483481407166, "step": 1300 }, { "epoch": 0.7, "learning_rate": 9.455974084057564e-08, "logits/chosen": -2.1645689010620117, "logits/rejected": -2.288266658782959, "logps/chosen": -0.44410577416419983, "logps/rejected": -0.4625498950481415, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.8643143773078918, "rewards/margins": 0.00047528743743896484, "rewards/rejected": 0.8638390898704529, "step": 1301 }, { "epoch": 0.7, "learning_rate": 9.454983049004445e-08, "logits/chosen": -2.022336959838867, "logits/rejected": -2.296337366104126, "logps/chosen": -0.8030881881713867, "logps/rejected": -0.7516981363296509, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 1.0764862298965454, "rewards/margins": 0.011513948440551758, "rewards/rejected": 1.0649722814559937, "step": 1302 }, { "epoch": 0.7, "learning_rate": 9.453991164133819e-08, "logits/chosen": -2.138049602508545, "logits/rejected": -2.3387155532836914, "logps/chosen": -5.787050724029541, "logps/rejected": -0.8808778524398804, "loss": 0.7826, "rewards/accuracies": 0.0, "rewards/chosen": 0.8385829925537109, "rewards/margins": -0.17147362232208252, "rewards/rejected": 1.0100566148757935, "step": 1303 }, { "epoch": 0.7, "learning_rate": 9.452998429634895e-08, "logits/chosen": -2.1311123371124268, "logits/rejected": -2.236358165740967, "logps/chosen": -0.8901896476745605, "logps/rejected": -3.5489702224731445, "loss": 0.6226, "rewards/accuracies": 1.0, "rewards/chosen": 1.1001970767974854, "rewards/margins": 0.14640003442764282, "rewards/rejected": 0.9537970423698425, "step": 1304 }, { "epoch": 0.7, "learning_rate": 9.452004845697044e-08, "logits/chosen": -1.975464940071106, "logits/rejected": -2.266197919845581, "logps/chosen": -3.1338610649108887, "logps/rejected": -10.74537181854248, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.8168544769287109, "rewards/margins": 0.0018042325973510742, "rewards/rejected": 0.8150502443313599, "step": 1305 }, { "epoch": 0.7, "learning_rate": 9.451010412509796e-08, "logits/chosen": -2.1963388919830322, "logits/rejected": -2.0807502269744873, "logps/chosen": -55.18296813964844, "logps/rejected": -9.110276222229004, "loss": 0.5826, "rewards/accuracies": 1.0, "rewards/chosen": 1.2752838134765625, "rewards/margins": 0.23482918739318848, "rewards/rejected": 1.040454626083374, "step": 1306 }, { "epoch": 0.7, "learning_rate": 9.450015130262847e-08, "logits/chosen": -2.0097246170043945, "logits/rejected": -2.009706735610962, "logps/chosen": -0.3885231912136078, "logps/rejected": -3.4421792030334473, "loss": 0.5352, "rewards/accuracies": 1.0, "rewards/chosen": 0.9495197534561157, "rewards/margins": 0.3456939458847046, "rewards/rejected": 0.6038258075714111, "step": 1307 }, { "epoch": 0.71, "learning_rate": 9.449018999146057e-08, "logits/chosen": -2.108663320541382, "logits/rejected": -2.3278615474700928, "logps/chosen": -1.7686980962753296, "logps/rejected": -1.2823803424835205, "loss": 0.6982, "rewards/accuracies": 0.0, "rewards/chosen": 0.9421221017837524, "rewards/margins": -0.010062038898468018, "rewards/rejected": 0.9521841406822205, "step": 1308 }, { "epoch": 0.71, "learning_rate": 9.44802201934944e-08, "logits/chosen": -2.0664303302764893, "logits/rejected": -2.2963383197784424, "logps/chosen": -2.272603988647461, "logps/rejected": -2.291774272918701, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.7899131178855896, "rewards/margins": 0.025857150554656982, "rewards/rejected": 0.7640559673309326, "step": 1309 }, { "epoch": 0.71, "learning_rate": 9.447024191063178e-08, "logits/chosen": -2.0719997882843018, "logits/rejected": -2.0776784420013428, "logps/chosen": -2.0975399017333984, "logps/rejected": -2.059619426727295, "loss": 0.5383, "rewards/accuracies": 1.0, "rewards/chosen": 1.019492268562317, "rewards/margins": 0.3381419777870178, "rewards/rejected": 0.6813502907752991, "step": 1310 }, { "epoch": 0.71, "learning_rate": 9.446025514477614e-08, "logits/chosen": -1.9836615324020386, "logits/rejected": -2.267443895339966, "logps/chosen": -0.6676737666130066, "logps/rejected": -0.8434571027755737, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.8885936141014099, "rewards/margins": -0.005932331085205078, "rewards/rejected": 0.894525945186615, "step": 1311 }, { "epoch": 0.71, "learning_rate": 9.445025989783253e-08, "logits/chosen": -2.1126086711883545, "logits/rejected": -2.112276554107666, "logps/chosen": -1.3005106449127197, "logps/rejected": -1.5926318168640137, "loss": 0.717, "rewards/accuracies": 0.0, "rewards/chosen": 0.8525139689445496, "rewards/margins": -0.04717361927032471, "rewards/rejected": 0.8996875882148743, "step": 1312 }, { "epoch": 0.71, "learning_rate": 9.444025617170759e-08, "logits/chosen": -2.142022132873535, "logits/rejected": -2.0736396312713623, "logps/chosen": -33.78297805786133, "logps/rejected": -3.3995916843414307, "loss": 0.4683, "rewards/accuracies": 1.0, "rewards/chosen": 1.16681706905365, "rewards/margins": 0.5152612328529358, "rewards/rejected": 0.6515558362007141, "step": 1313 }, { "epoch": 0.71, "learning_rate": 9.44302439683096e-08, "logits/chosen": -2.1018054485321045, "logits/rejected": -2.098806619644165, "logps/chosen": -6.021929740905762, "logps/rejected": -2.8993442058563232, "loss": 0.4816, "rewards/accuracies": 1.0, "rewards/chosen": 1.366953730583191, "rewards/margins": 0.4802119731903076, "rewards/rejected": 0.8867417573928833, "step": 1314 }, { "epoch": 0.71, "learning_rate": 9.442022328954847e-08, "logits/chosen": -2.07625150680542, "logits/rejected": -2.088526964187622, "logps/chosen": -3.144866943359375, "logps/rejected": -4.51451301574707, "loss": 0.7157, "rewards/accuracies": 0.0, "rewards/chosen": 0.9645435214042664, "rewards/margins": -0.044688403606414795, "rewards/rejected": 1.0092319250106812, "step": 1315 }, { "epoch": 0.71, "learning_rate": 9.441019413733568e-08, "logits/chosen": -2.1360301971435547, "logits/rejected": -2.1426689624786377, "logps/chosen": -4.170695781707764, "logps/rejected": -7.624820232391357, "loss": 0.4147, "rewards/accuracies": 1.0, "rewards/chosen": 1.2110767364501953, "rewards/margins": 0.6658307313919067, "rewards/rejected": 0.5452460050582886, "step": 1316 }, { "epoch": 0.71, "learning_rate": 9.44001565135844e-08, "logits/chosen": -2.015723705291748, "logits/rejected": -2.013814687728882, "logps/chosen": -0.5391695499420166, "logps/rejected": -2.9377753734588623, "loss": 0.5343, "rewards/accuracies": 1.0, "rewards/chosen": 0.9152164459228516, "rewards/margins": 0.3478628993034363, "rewards/rejected": 0.5673535466194153, "step": 1317 }, { "epoch": 0.71, "learning_rate": 9.439011042020933e-08, "logits/chosen": -2.098997116088867, "logits/rejected": -2.100163698196411, "logps/chosen": -1.648669958114624, "logps/rejected": -1.5061694383621216, "loss": 0.5637, "rewards/accuracies": 1.0, "rewards/chosen": 1.0251199007034302, "rewards/margins": 0.2782783508300781, "rewards/rejected": 0.746841549873352, "step": 1318 }, { "epoch": 0.71, "learning_rate": 9.438005585912685e-08, "logits/chosen": -2.1150996685028076, "logits/rejected": -2.284196615219116, "logps/chosen": -4.2120537757873535, "logps/rejected": -5.18597936630249, "loss": 0.7916, "rewards/accuracies": 0.0, "rewards/chosen": 0.7947244048118591, "rewards/margins": -0.1881081461906433, "rewards/rejected": 0.9828325510025024, "step": 1319 }, { "epoch": 0.71, "learning_rate": 9.436999283225495e-08, "logits/chosen": -2.1885509490966797, "logits/rejected": -2.3174991607666016, "logps/chosen": -13.22068977355957, "logps/rejected": -8.420219421386719, "loss": 0.8118, "rewards/accuracies": 0.0, "rewards/chosen": 0.7113010287284851, "rewards/margins": -0.22471928596496582, "rewards/rejected": 0.9360203146934509, "step": 1320 }, { "epoch": 0.71, "learning_rate": 9.435992134151319e-08, "logits/chosen": -2.164947986602783, "logits/rejected": -2.0750763416290283, "logps/chosen": -34.96221160888672, "logps/rejected": -2.5621793270111084, "loss": 0.396, "rewards/accuracies": 1.0, "rewards/chosen": 1.28509521484375, "rewards/margins": 0.7219400405883789, "rewards/rejected": 0.5631551742553711, "step": 1321 }, { "epoch": 0.71, "learning_rate": 9.434984138882279e-08, "logits/chosen": -1.979704737663269, "logits/rejected": -2.2709648609161377, "logps/chosen": -0.46286463737487793, "logps/rejected": -0.516771137714386, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.8353061079978943, "rewards/margins": 0.0016440749168395996, "rewards/rejected": 0.8336620330810547, "step": 1322 }, { "epoch": 0.71, "learning_rate": 9.433975297610655e-08, "logits/chosen": -2.1363484859466553, "logits/rejected": -2.1435840129852295, "logps/chosen": -7.1687493324279785, "logps/rejected": -3.3051376342773438, "loss": 0.7758, "rewards/accuracies": 0.0, "rewards/chosen": 1.0254980325698853, "rewards/margins": -0.1589374542236328, "rewards/rejected": 1.184435486793518, "step": 1323 }, { "epoch": 0.71, "learning_rate": 9.432965610528893e-08, "logits/chosen": -1.9645137786865234, "logits/rejected": -2.0267534255981445, "logps/chosen": -5.866459846496582, "logps/rejected": -23.028596878051758, "loss": 0.3105, "rewards/accuracies": 1.0, "rewards/chosen": 1.3937709331512451, "rewards/margins": 1.0104886293411255, "rewards/rejected": 0.38328227400779724, "step": 1324 }, { "epoch": 0.71, "learning_rate": 9.431955077829595e-08, "logits/chosen": -2.0936989784240723, "logits/rejected": -2.282892942428589, "logps/chosen": -14.95772933959961, "logps/rejected": -10.122628211975098, "loss": 0.5893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9795168042182922, "rewards/margins": 0.21969521045684814, "rewards/rejected": 0.7598215937614441, "step": 1325 }, { "epoch": 0.72, "learning_rate": 9.430943699705529e-08, "logits/chosen": -1.9473121166229248, "logits/rejected": -2.2773077487945557, "logps/chosen": -1.5528326034545898, "logps/rejected": -1.511380910873413, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": 0.8942510485649109, "rewards/margins": -0.009213447570800781, "rewards/rejected": 0.9034644961357117, "step": 1326 }, { "epoch": 0.72, "learning_rate": 9.429931476349619e-08, "logits/chosen": -2.044687032699585, "logits/rejected": -2.0467324256896973, "logps/chosen": -1.758810043334961, "logps/rejected": -4.140186309814453, "loss": 0.4943, "rewards/accuracies": 1.0, "rewards/chosen": 0.9700586199760437, "rewards/margins": 0.44738030433654785, "rewards/rejected": 0.5226783156394958, "step": 1327 }, { "epoch": 0.72, "learning_rate": 9.428918407954957e-08, "logits/chosen": -1.9979655742645264, "logits/rejected": -2.2494115829467773, "logps/chosen": -0.545889675617218, "logps/rejected": -0.5867931842803955, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.9848260283470154, "rewards/margins": 0.011417210102081299, "rewards/rejected": 0.9734088182449341, "step": 1328 }, { "epoch": 0.72, "learning_rate": 9.42790449471479e-08, "logits/chosen": -2.0039942264556885, "logits/rejected": -2.0013680458068848, "logps/chosen": -9.798588752746582, "logps/rejected": -1.6621167659759521, "loss": 0.4627, "rewards/accuracies": 1.0, "rewards/chosen": 1.3337615728378296, "rewards/margins": 0.530297040939331, "rewards/rejected": 0.8034645318984985, "step": 1329 }, { "epoch": 0.72, "learning_rate": 9.426889736822529e-08, "logits/chosen": -2.062575101852417, "logits/rejected": -2.2442920207977295, "logps/chosen": -2.7726712226867676, "logps/rejected": -2.882885456085205, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.9302482604980469, "rewards/margins": 0.007014095783233643, "rewards/rejected": 0.9232341647148132, "step": 1330 }, { "epoch": 0.72, "learning_rate": 9.425874134471747e-08, "logits/chosen": -2.129849433898926, "logits/rejected": -2.267979383468628, "logps/chosen": -2.7788093090057373, "logps/rejected": -8.416797637939453, "loss": 0.6263, "rewards/accuracies": 1.0, "rewards/chosen": 1.1054561138153076, "rewards/margins": 0.13857263326644897, "rewards/rejected": 0.9668834805488586, "step": 1331 }, { "epoch": 0.72, "learning_rate": 9.424857687856176e-08, "logits/chosen": -1.9926183223724365, "logits/rejected": -2.252659320831299, "logps/chosen": -0.6896620988845825, "logps/rejected": -0.6826232075691223, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.9839785695075989, "rewards/margins": 0.0007281303405761719, "rewards/rejected": 0.9832504391670227, "step": 1332 }, { "epoch": 0.72, "learning_rate": 9.423840397169709e-08, "logits/chosen": -2.030447483062744, "logits/rejected": -2.2256228923797607, "logps/chosen": -2.4142160415649414, "logps/rejected": -0.8658164143562317, "loss": 0.6404, "rewards/accuracies": 1.0, "rewards/chosen": 1.0280787944793701, "rewards/margins": 0.10846447944641113, "rewards/rejected": 0.919614315032959, "step": 1333 }, { "epoch": 0.72, "learning_rate": 9.422822262606401e-08, "logits/chosen": -2.077410936355591, "logits/rejected": -2.035116195678711, "logps/chosen": -12.241337776184082, "logps/rejected": -5.973908424377441, "loss": 0.51, "rewards/accuracies": 1.0, "rewards/chosen": 1.2117356061935425, "rewards/margins": 0.4074666500091553, "rewards/rejected": 0.8042689561843872, "step": 1334 }, { "epoch": 0.72, "learning_rate": 9.42180328436047e-08, "logits/chosen": -2.0927505493164062, "logits/rejected": -2.0944981575012207, "logps/chosen": -0.8125674724578857, "logps/rejected": -2.6179604530334473, "loss": 0.5887, "rewards/accuracies": 1.0, "rewards/chosen": 0.7454177141189575, "rewards/margins": 0.22105342149734497, "rewards/rejected": 0.5243642926216125, "step": 1335 }, { "epoch": 0.72, "learning_rate": 9.420783462626291e-08, "logits/chosen": -2.0044305324554443, "logits/rejected": -2.2799582481384277, "logps/chosen": -0.8845421671867371, "logps/rejected": -0.9212038516998291, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.9246803522109985, "rewards/margins": 0.017359375953674316, "rewards/rejected": 0.9073209762573242, "step": 1336 }, { "epoch": 0.72, "learning_rate": 9.419762797598403e-08, "logits/chosen": -1.9596772193908691, "logits/rejected": -1.9656567573547363, "logps/chosen": -2.1819422245025635, "logps/rejected": -3.390514373779297, "loss": 0.433, "rewards/accuracies": 1.0, "rewards/chosen": 1.1459354162216187, "rewards/margins": 0.61259526014328, "rewards/rejected": 0.5333401560783386, "step": 1337 }, { "epoch": 0.72, "learning_rate": 9.418741289471503e-08, "logits/chosen": -2.099787712097168, "logits/rejected": -2.264474391937256, "logps/chosen": -0.8984475135803223, "logps/rejected": -0.9543547630310059, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.8975610733032227, "rewards/margins": -0.008467674255371094, "rewards/rejected": 0.9060287475585938, "step": 1338 }, { "epoch": 0.72, "learning_rate": 9.417718938440452e-08, "logits/chosen": -2.0698485374450684, "logits/rejected": -2.0818331241607666, "logps/chosen": -7.578239917755127, "logps/rejected": -5.679530620574951, "loss": 0.3737, "rewards/accuracies": 1.0, "rewards/chosen": 1.6307522058486938, "rewards/margins": 0.7916328310966492, "rewards/rejected": 0.8391193747520447, "step": 1339 }, { "epoch": 0.72, "learning_rate": 9.416695744700269e-08, "logits/chosen": -1.9957464933395386, "logits/rejected": -1.9990990161895752, "logps/chosen": -5.432106971740723, "logps/rejected": -7.288813591003418, "loss": 0.395, "rewards/accuracies": 1.0, "rewards/chosen": 1.2095767259597778, "rewards/margins": 0.7248827219009399, "rewards/rejected": 0.4846940040588379, "step": 1340 }, { "epoch": 0.72, "learning_rate": 9.415671708446137e-08, "logits/chosen": -2.029538154602051, "logits/rejected": -2.0297651290893555, "logps/chosen": -2.028657913208008, "logps/rejected": -0.8774229288101196, "loss": 0.611, "rewards/accuracies": 1.0, "rewards/chosen": 0.8816079497337341, "rewards/margins": 0.17168664932250977, "rewards/rejected": 0.7099213004112244, "step": 1341 }, { "epoch": 0.72, "learning_rate": 9.414646829873395e-08, "logits/chosen": -2.027053117752075, "logits/rejected": -2.268411874771118, "logps/chosen": -0.48294031620025635, "logps/rejected": -0.5196148753166199, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8843888640403748, "rewards/margins": 0.020751237869262695, "rewards/rejected": 0.8636376261711121, "step": 1342 }, { "epoch": 0.72, "learning_rate": 9.413621109177548e-08, "logits/chosen": -2.178755760192871, "logits/rejected": -2.17364501953125, "logps/chosen": -7.113836765289307, "logps/rejected": -6.032157897949219, "loss": 0.4245, "rewards/accuracies": 1.0, "rewards/chosen": 1.1226627826690674, "rewards/margins": 0.6372072696685791, "rewards/rejected": 0.4854555130004883, "step": 1343 }, { "epoch": 0.72, "learning_rate": 9.412594546554256e-08, "logits/chosen": -2.0478711128234863, "logits/rejected": -2.048072099685669, "logps/chosen": -4.178986549377441, "logps/rejected": -3.5541698932647705, "loss": 0.3798, "rewards/accuracies": 1.0, "rewards/chosen": 1.412894368171692, "rewards/margins": 0.772101104259491, "rewards/rejected": 0.6407932639122009, "step": 1344 }, { "epoch": 0.73, "learning_rate": 9.411567142199345e-08, "logits/chosen": -2.026228427886963, "logits/rejected": -2.259253740310669, "logps/chosen": -3.719363212585449, "logps/rejected": -3.8607425689697266, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.7349972724914551, "rewards/margins": -0.003291904926300049, "rewards/rejected": 0.7382891774177551, "step": 1345 }, { "epoch": 0.73, "learning_rate": 9.410538896308797e-08, "logits/chosen": -2.0561516284942627, "logits/rejected": -2.289107322692871, "logps/chosen": -0.45208120346069336, "logps/rejected": -0.4461422860622406, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.8532692193984985, "rewards/margins": 0.01807922124862671, "rewards/rejected": 0.8351899981498718, "step": 1346 }, { "epoch": 0.73, "learning_rate": 9.40950980907876e-08, "logits/chosen": -2.0578441619873047, "logits/rejected": -2.0630650520324707, "logps/chosen": -1.0669605731964111, "logps/rejected": -4.23838996887207, "loss": 0.4994, "rewards/accuracies": 1.0, "rewards/chosen": 0.9206894040107727, "rewards/margins": 0.4342842102050781, "rewards/rejected": 0.4864051938056946, "step": 1347 }, { "epoch": 0.73, "learning_rate": 9.408479880705538e-08, "logits/chosen": -2.1093931198120117, "logits/rejected": -2.094029426574707, "logps/chosen": -2.044712781906128, "logps/rejected": -9.097747802734375, "loss": 0.4448, "rewards/accuracies": 1.0, "rewards/chosen": 1.1650842428207397, "rewards/margins": 0.5795716643333435, "rewards/rejected": 0.5855125784873962, "step": 1348 }, { "epoch": 0.73, "learning_rate": 9.407449111385595e-08, "logits/chosen": -2.1150169372558594, "logits/rejected": -2.119579315185547, "logps/chosen": -4.156125545501709, "logps/rejected": -3.444439172744751, "loss": 0.4456, "rewards/accuracies": 1.0, "rewards/chosen": 1.126079797744751, "rewards/margins": 0.5774036049842834, "rewards/rejected": 0.5486761927604675, "step": 1349 }, { "epoch": 0.73, "learning_rate": 9.406417501315557e-08, "logits/chosen": -2.2032487392425537, "logits/rejected": -2.1990907192230225, "logps/chosen": -5.583958148956299, "logps/rejected": -2.4031198024749756, "loss": 0.4418, "rewards/accuracies": 1.0, "rewards/chosen": 1.200724482536316, "rewards/margins": 0.5878081917762756, "rewards/rejected": 0.6129162907600403, "step": 1350 }, { "epoch": 0.73, "learning_rate": 9.405385050692213e-08, "logits/chosen": -1.967651605606079, "logits/rejected": -2.240779161453247, "logps/chosen": -0.7388650178909302, "logps/rejected": -0.8282846212387085, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 1.0537786483764648, "rewards/margins": 0.012475132942199707, "rewards/rejected": 1.0413035154342651, "step": 1351 }, { "epoch": 0.73, "learning_rate": 9.404351759712507e-08, "logits/chosen": -2.0543696880340576, "logits/rejected": -2.298741340637207, "logps/chosen": -0.9608217477798462, "logps/rejected": -1.165632724761963, "loss": 0.6673, "rewards/accuracies": 1.0, "rewards/chosen": 0.7633351683616638, "rewards/margins": 0.05231529474258423, "rewards/rejected": 0.7110198736190796, "step": 1352 }, { "epoch": 0.73, "learning_rate": 9.403317628573548e-08, "logits/chosen": -2.0695149898529053, "logits/rejected": -2.313328981399536, "logps/chosen": -0.6357734799385071, "logps/rejected": -1.297006368637085, "loss": 0.6612, "rewards/accuracies": 1.0, "rewards/chosen": 0.9776404500007629, "rewards/margins": 0.06489914655685425, "rewards/rejected": 0.9127413034439087, "step": 1353 }, { "epoch": 0.73, "learning_rate": 9.402282657472603e-08, "logits/chosen": -2.1138529777526855, "logits/rejected": -2.267282247543335, "logps/chosen": -2.2891972064971924, "logps/rejected": -2.4981284141540527, "loss": 0.6675, "rewards/accuracies": 1.0, "rewards/chosen": 0.8266066908836365, "rewards/margins": 0.05198979377746582, "rewards/rejected": 0.7746168971061707, "step": 1354 }, { "epoch": 0.73, "learning_rate": 9.401246846607101e-08, "logits/chosen": -2.1305601596832275, "logits/rejected": -2.130295991897583, "logps/chosen": -5.427150726318359, "logps/rejected": -2.391139507293701, "loss": 0.3086, "rewards/accuracies": 1.0, "rewards/chosen": 1.5621103048324585, "rewards/margins": 1.0174696445465088, "rewards/rejected": 0.5446407198905945, "step": 1355 }, { "epoch": 0.73, "learning_rate": 9.400210196174625e-08, "logits/chosen": -2.0803470611572266, "logits/rejected": -2.0720131397247314, "logps/chosen": -2.740384101867676, "logps/rejected": -10.47925090789795, "loss": 0.3908, "rewards/accuracies": 1.0, "rewards/chosen": 1.2976040840148926, "rewards/margins": 0.7378085851669312, "rewards/rejected": 0.5597954988479614, "step": 1356 }, { "epoch": 0.73, "learning_rate": 9.399172706372929e-08, "logits/chosen": -1.9821525812149048, "logits/rejected": -1.9822126626968384, "logps/chosen": -0.48236632347106934, "logps/rejected": -2.475135564804077, "loss": 0.5442, "rewards/accuracies": 1.0, "rewards/chosen": 0.8798465728759766, "rewards/margins": 0.3239458203315735, "rewards/rejected": 0.5559007525444031, "step": 1357 }, { "epoch": 0.73, "learning_rate": 9.398134377399916e-08, "logits/chosen": -2.0492711067199707, "logits/rejected": -2.1126441955566406, "logps/chosen": -8.16572380065918, "logps/rejected": -16.98662567138672, "loss": 0.4143, "rewards/accuracies": 1.0, "rewards/chosen": 1.27786386013031, "rewards/margins": 0.6669034361839294, "rewards/rejected": 0.6109604239463806, "step": 1358 }, { "epoch": 0.73, "learning_rate": 9.397095209453657e-08, "logits/chosen": -2.0673859119415283, "logits/rejected": -2.0556414127349854, "logps/chosen": -0.6174017190933228, "logps/rejected": -5.341019153594971, "loss": 0.4989, "rewards/accuracies": 1.0, "rewards/chosen": 0.9894286394119263, "rewards/margins": 0.43559950590133667, "rewards/rejected": 0.5538291335105896, "step": 1359 }, { "epoch": 0.73, "learning_rate": 9.39605520273238e-08, "logits/chosen": -2.115781784057617, "logits/rejected": -2.3159782886505127, "logps/chosen": -0.9844118356704712, "logps/rejected": -0.9839159846305847, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 1.0105773210525513, "rewards/margins": 0.021891295909881592, "rewards/rejected": 0.9886860251426697, "step": 1360 }, { "epoch": 0.73, "learning_rate": 9.395014357434471e-08, "logits/chosen": -2.023883819580078, "logits/rejected": -2.074655771255493, "logps/chosen": -2.794220447540283, "logps/rejected": -23.18177032470703, "loss": 0.3719, "rewards/accuracies": 1.0, "rewards/chosen": 1.1070196628570557, "rewards/margins": 0.7972902655601501, "rewards/rejected": 0.3097293972969055, "step": 1361 }, { "epoch": 0.73, "learning_rate": 9.393972673758481e-08, "logits/chosen": -1.9286682605743408, "logits/rejected": -1.9366785287857056, "logps/chosen": -1.8435158729553223, "logps/rejected": -2.8901658058166504, "loss": 0.5314, "rewards/accuracies": 1.0, "rewards/chosen": 0.984698474407196, "rewards/margins": 0.3547102212905884, "rewards/rejected": 0.6299882531166077, "step": 1362 }, { "epoch": 0.74, "learning_rate": 9.392930151903115e-08, "logits/chosen": -2.0224833488464355, "logits/rejected": -2.2571072578430176, "logps/chosen": -0.7887319326400757, "logps/rejected": -0.7202466726303101, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7778395414352417, "rewards/margins": 0.01761525869369507, "rewards/rejected": 0.7602242827415466, "step": 1363 }, { "epoch": 0.74, "learning_rate": 9.391886792067245e-08, "logits/chosen": -2.0465292930603027, "logits/rejected": -2.0521769523620605, "logps/chosen": -1.1512260437011719, "logps/rejected": -19.890628814697266, "loss": 0.576, "rewards/accuracies": 1.0, "rewards/chosen": 1.0481799840927124, "rewards/margins": 0.2497691512107849, "rewards/rejected": 0.7984108328819275, "step": 1364 }, { "epoch": 0.74, "learning_rate": 9.390842594449897e-08, "logits/chosen": -1.9829246997833252, "logits/rejected": -2.2249817848205566, "logps/chosen": -0.5603294968605042, "logps/rejected": -0.6613384485244751, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.9867273569107056, "rewards/margins": 0.020911812782287598, "rewards/rejected": 0.965815544128418, "step": 1365 }, { "epoch": 0.74, "learning_rate": 9.389797559250256e-08, "logits/chosen": -2.0326027870178223, "logits/rejected": -2.059748888015747, "logps/chosen": -6.16701602935791, "logps/rejected": -4.685663223266602, "loss": 0.654, "rewards/accuracies": 1.0, "rewards/chosen": 0.974794864654541, "rewards/margins": 0.07993781566619873, "rewards/rejected": 0.8948570489883423, "step": 1366 }, { "epoch": 0.74, "learning_rate": 9.388751686667673e-08, "logits/chosen": -2.1248834133148193, "logits/rejected": -2.1326870918273926, "logps/chosen": -2.3967113494873047, "logps/rejected": -2.2527997493743896, "loss": 0.5712, "rewards/accuracies": 1.0, "rewards/chosen": 0.9396568536758423, "rewards/margins": 0.2608465552330017, "rewards/rejected": 0.6788102984428406, "step": 1367 }, { "epoch": 0.74, "learning_rate": 9.387704976901652e-08, "logits/chosen": -2.005876302719116, "logits/rejected": -2.270301580429077, "logps/chosen": -7.401854038238525, "logps/rejected": -6.694886684417725, "loss": 0.7149, "rewards/accuracies": 0.0, "rewards/chosen": 0.5107061266899109, "rewards/margins": -0.04312729835510254, "rewards/rejected": 0.5538334250450134, "step": 1368 }, { "epoch": 0.74, "learning_rate": 9.386657430151863e-08, "logits/chosen": -2.0336198806762695, "logits/rejected": -2.0197999477386475, "logps/chosen": -15.944343566894531, "logps/rejected": -4.4616594314575195, "loss": 0.6349, "rewards/accuracies": 1.0, "rewards/chosen": 1.187729835510254, "rewards/margins": 0.12016737461090088, "rewards/rejected": 1.067562460899353, "step": 1369 }, { "epoch": 0.74, "learning_rate": 9.38560904661813e-08, "logits/chosen": -1.977609634399414, "logits/rejected": -1.9745763540267944, "logps/chosen": -3.1353378295898438, "logps/rejected": -6.74873161315918, "loss": 0.5332, "rewards/accuracies": 1.0, "rewards/chosen": 0.9772025942802429, "rewards/margins": 0.3505077362060547, "rewards/rejected": 0.6266948580741882, "step": 1370 }, { "epoch": 0.74, "learning_rate": 9.38455982650044e-08, "logits/chosen": -2.148566961288452, "logits/rejected": -2.302968740463257, "logps/chosen": -2.228463649749756, "logps/rejected": -2.1785826683044434, "loss": 0.701, "rewards/accuracies": 0.0, "rewards/chosen": 0.9568943381309509, "rewards/margins": -0.015677988529205322, "rewards/rejected": 0.9725723266601562, "step": 1371 }, { "epoch": 0.74, "learning_rate": 9.383509769998939e-08, "logits/chosen": -1.9732768535614014, "logits/rejected": -2.244572639465332, "logps/chosen": -1.5999712944030762, "logps/rejected": -1.5567904710769653, "loss": 0.6996, "rewards/accuracies": 0.0, "rewards/chosen": 0.7881497144699097, "rewards/margins": -0.012862622737884521, "rewards/rejected": 0.8010123372077942, "step": 1372 }, { "epoch": 0.74, "learning_rate": 9.382458877313932e-08, "logits/chosen": -2.070671319961548, "logits/rejected": -2.0718746185302734, "logps/chosen": -3.1945502758026123, "logps/rejected": -3.9227914810180664, "loss": 0.3293, "rewards/accuracies": 1.0, "rewards/chosen": 1.4721601009368896, "rewards/margins": 0.9417684078216553, "rewards/rejected": 0.5303916931152344, "step": 1373 }, { "epoch": 0.74, "learning_rate": 9.381407148645883e-08, "logits/chosen": -2.033717632293701, "logits/rejected": -2.034972667694092, "logps/chosen": -2.5601630210876465, "logps/rejected": -1.7695907354354858, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 1.1083269119262695, "rewards/margins": -0.0054111480712890625, "rewards/rejected": 1.1137380599975586, "step": 1374 }, { "epoch": 0.74, "learning_rate": 9.380354584195418e-08, "logits/chosen": -2.1948554515838623, "logits/rejected": -2.210609197616577, "logps/chosen": -5.294551849365234, "logps/rejected": -10.180624008178711, "loss": 0.5201, "rewards/accuracies": 1.0, "rewards/chosen": 1.1925960779190063, "rewards/margins": 0.38238638639450073, "rewards/rejected": 0.8102096915245056, "step": 1375 }, { "epoch": 0.74, "learning_rate": 9.379301184163319e-08, "logits/chosen": -1.9989073276519775, "logits/rejected": -2.080204963684082, "logps/chosen": -3.0133719444274902, "logps/rejected": -23.873666763305664, "loss": 0.5416, "rewards/accuracies": 1.0, "rewards/chosen": 0.7730494141578674, "rewards/margins": 0.33019644021987915, "rewards/rejected": 0.4428529739379883, "step": 1376 }, { "epoch": 0.74, "learning_rate": 9.37824694875053e-08, "logits/chosen": -1.9901477098464966, "logits/rejected": -1.9898219108581543, "logps/chosen": -1.2125722169876099, "logps/rejected": -1.2879608869552612, "loss": 0.6332, "rewards/accuracies": 1.0, "rewards/chosen": 0.9851910471916199, "rewards/margins": 0.12377476692199707, "rewards/rejected": 0.8614162802696228, "step": 1377 }, { "epoch": 0.74, "learning_rate": 9.377191878158153e-08, "logits/chosen": -2.0163047313690186, "logits/rejected": -2.0169501304626465, "logps/chosen": -1.9324049949645996, "logps/rejected": -1.541895866394043, "loss": 0.6419, "rewards/accuracies": 1.0, "rewards/chosen": 1.0249031782150269, "rewards/margins": 0.10524773597717285, "rewards/rejected": 0.919655442237854, "step": 1378 }, { "epoch": 0.74, "learning_rate": 9.376135972587449e-08, "logits/chosen": -1.9650628566741943, "logits/rejected": -2.255295753479004, "logps/chosen": -3.395099401473999, "logps/rejected": -6.971512317657471, "loss": 0.5619, "rewards/accuracies": 1.0, "rewards/chosen": 0.7318102121353149, "rewards/margins": 0.28225618600845337, "rewards/rejected": 0.4495540261268616, "step": 1379 }, { "epoch": 0.74, "learning_rate": 9.37507923223984e-08, "logits/chosen": -2.153149127960205, "logits/rejected": -2.149759531021118, "logps/chosen": -7.637890338897705, "logps/rejected": -4.242144584655762, "loss": 0.4173, "rewards/accuracies": 1.0, "rewards/chosen": 1.2044670581817627, "rewards/margins": 0.658108651638031, "rewards/rejected": 0.5463584065437317, "step": 1380 }, { "epoch": 0.74, "learning_rate": 9.374021657316904e-08, "logits/chosen": -2.066028594970703, "logits/rejected": -2.070775270462036, "logps/chosen": -1.7594715356826782, "logps/rejected": -2.5713024139404297, "loss": 0.4757, "rewards/accuracies": 1.0, "rewards/chosen": 1.1545847654342651, "rewards/margins": 0.49558281898498535, "rewards/rejected": 0.6590019464492798, "step": 1381 }, { "epoch": 0.75, "learning_rate": 9.372963248020384e-08, "logits/chosen": -2.1423635482788086, "logits/rejected": -2.27339243888855, "logps/chosen": -19.065595626831055, "logps/rejected": -12.742073059082031, "loss": 0.5781, "rewards/accuracies": 1.0, "rewards/chosen": 0.7239933013916016, "rewards/margins": 0.24515476822853088, "rewards/rejected": 0.4788385331630707, "step": 1382 }, { "epoch": 0.75, "learning_rate": 9.371904004552177e-08, "logits/chosen": -2.023902177810669, "logits/rejected": -2.285088300704956, "logps/chosen": -2.9775238037109375, "logps/rejected": -2.823817253112793, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.8549478650093079, "rewards/margins": 0.013390898704528809, "rewards/rejected": 0.841556966304779, "step": 1383 }, { "epoch": 0.75, "learning_rate": 9.37084392711434e-08, "logits/chosen": -2.2929179668426514, "logits/rejected": -2.2848899364471436, "logps/chosen": -1.229390263557434, "logps/rejected": -0.6658384799957275, "loss": 0.7099, "rewards/accuracies": 0.0, "rewards/chosen": 0.7210904359817505, "rewards/margins": -0.03314906358718872, "rewards/rejected": 0.7542394995689392, "step": 1384 }, { "epoch": 0.75, "learning_rate": 9.36978301590909e-08, "logits/chosen": -2.107621669769287, "logits/rejected": -2.1097471714019775, "logps/chosen": -3.9568800926208496, "logps/rejected": -9.626588821411133, "loss": 0.4416, "rewards/accuracies": 1.0, "rewards/chosen": 1.2250044345855713, "rewards/margins": 0.5883663296699524, "rewards/rejected": 0.6366381049156189, "step": 1385 }, { "epoch": 0.75, "learning_rate": 9.368721271138803e-08, "logits/chosen": -2.213505744934082, "logits/rejected": -2.2314023971557617, "logps/chosen": -3.9387600421905518, "logps/rejected": -7.658852577209473, "loss": 0.411, "rewards/accuracies": 1.0, "rewards/chosen": 1.2667911052703857, "rewards/margins": 0.6766495108604431, "rewards/rejected": 0.5901415944099426, "step": 1386 }, { "epoch": 0.75, "learning_rate": 9.367658693006016e-08, "logits/chosen": -2.179328441619873, "logits/rejected": -2.178988218307495, "logps/chosen": -2.8604512214660645, "logps/rejected": -4.811487674713135, "loss": 0.493, "rewards/accuracies": 1.0, "rewards/chosen": 0.9816938638687134, "rewards/margins": 0.45051759481430054, "rewards/rejected": 0.5311762690544128, "step": 1387 }, { "epoch": 0.75, "learning_rate": 9.366595281713419e-08, "logits/chosen": -2.0315730571746826, "logits/rejected": -2.298227310180664, "logps/chosen": -1.0973838567733765, "logps/rejected": -0.962292492389679, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.7965173125267029, "rewards/margins": 0.0074269771575927734, "rewards/rejected": 0.7890903353691101, "step": 1388 }, { "epoch": 0.75, "learning_rate": 9.365531037463865e-08, "logits/chosen": -1.9812341928482056, "logits/rejected": -1.9787291288375854, "logps/chosen": -3.9496119022369385, "logps/rejected": -2.9372360706329346, "loss": 0.394, "rewards/accuracies": 1.0, "rewards/chosen": 1.371850848197937, "rewards/margins": 0.7279204726219177, "rewards/rejected": 0.6439303755760193, "step": 1389 }, { "epoch": 0.75, "learning_rate": 9.364465960460371e-08, "logits/chosen": -2.1936044692993164, "logits/rejected": -2.1884658336639404, "logps/chosen": -6.63377571105957, "logps/rejected": -2.969261884689331, "loss": 0.5309, "rewards/accuracies": 1.0, "rewards/chosen": 0.9080657958984375, "rewards/margins": 0.35605818033218384, "rewards/rejected": 0.5520076155662537, "step": 1390 }, { "epoch": 0.75, "learning_rate": 9.363400050906101e-08, "logits/chosen": -1.9884482622146606, "logits/rejected": -1.9780550003051758, "logps/chosen": -6.423096656799316, "logps/rejected": -5.004953384399414, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": 1.0359416007995605, "rewards/margins": 0.4706466794013977, "rewards/rejected": 0.5652949213981628, "step": 1391 }, { "epoch": 0.75, "learning_rate": 9.362333309004387e-08, "logits/chosen": -2.156454086303711, "logits/rejected": -2.2240774631500244, "logps/chosen": -2.6770713329315186, "logps/rejected": -2.764885425567627, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.9267348647117615, "rewards/margins": 0.019918739795684814, "rewards/rejected": 0.9068161249160767, "step": 1392 }, { "epoch": 0.75, "learning_rate": 9.361265734958717e-08, "logits/chosen": -2.164188861846924, "logits/rejected": -2.2355480194091797, "logps/chosen": -2.6817901134490967, "logps/rejected": -2.60487699508667, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.77777498960495, "rewards/margins": 0.004517018795013428, "rewards/rejected": 0.7732579708099365, "step": 1393 }, { "epoch": 0.75, "learning_rate": 9.360197328972738e-08, "logits/chosen": -2.1881895065307617, "logits/rejected": -2.0657551288604736, "logps/chosen": -38.346588134765625, "logps/rejected": -4.068365097045898, "loss": 0.3732, "rewards/accuracies": 1.0, "rewards/chosen": 1.4425948858261108, "rewards/margins": 0.7931498289108276, "rewards/rejected": 0.6494450569152832, "step": 1394 }, { "epoch": 0.75, "learning_rate": 9.359128091250256e-08, "logits/chosen": -2.1280598640441895, "logits/rejected": -2.2318127155303955, "logps/chosen": -1.5312864780426025, "logps/rejected": -1.3000659942626953, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": 1.000139594078064, "rewards/margins": -0.012390613555908203, "rewards/rejected": 1.0125302076339722, "step": 1395 }, { "epoch": 0.75, "learning_rate": 9.358058021995235e-08, "logits/chosen": -1.990220308303833, "logits/rejected": -2.2405765056610107, "logps/chosen": -0.7302705645561218, "logps/rejected": -0.7154672145843506, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.9453837275505066, "rewards/margins": 0.01733386516571045, "rewards/rejected": 0.9280498623847961, "step": 1396 }, { "epoch": 0.75, "learning_rate": 9.356987121411797e-08, "logits/chosen": -2.0359373092651367, "logits/rejected": -2.0371317863464355, "logps/chosen": -1.5802724361419678, "logps/rejected": -1.7810989618301392, "loss": 0.5452, "rewards/accuracies": 1.0, "rewards/chosen": 1.034462332725525, "rewards/margins": 0.32158994674682617, "rewards/rejected": 0.7128723859786987, "step": 1397 }, { "epoch": 0.75, "learning_rate": 9.355915389704223e-08, "logits/chosen": -2.0885066986083984, "logits/rejected": -2.2656288146972656, "logps/chosen": -3.3942041397094727, "logps/rejected": -3.587679862976074, "loss": 0.7014, "rewards/accuracies": 0.0, "rewards/chosen": 0.7432608604431152, "rewards/margins": -0.016472458839416504, "rewards/rejected": 0.7597333192825317, "step": 1398 }, { "epoch": 0.75, "learning_rate": 9.354842827076954e-08, "logits/chosen": -2.186250686645508, "logits/rejected": -2.1751747131347656, "logps/chosen": -12.420334815979004, "logps/rejected": -2.5289089679718018, "loss": 0.5523, "rewards/accuracies": 1.0, "rewards/chosen": 1.344929814338684, "rewards/margins": 0.30482017993927, "rewards/rejected": 1.040109634399414, "step": 1399 }, { "epoch": 0.76, "learning_rate": 9.353769433734588e-08, "logits/chosen": -2.06266713142395, "logits/rejected": -2.265130043029785, "logps/chosen": -0.576330304145813, "logps/rejected": -0.6280360221862793, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8383318781852722, "rewards/margins": 0.02253901958465576, "rewards/rejected": 0.8157928586006165, "step": 1400 }, { "epoch": 0.76, "learning_rate": 9.352695209881882e-08, "logits/chosen": -1.9915335178375244, "logits/rejected": -1.9909415245056152, "logps/chosen": -1.5462950468063354, "logps/rejected": -2.632007360458374, "loss": 0.523, "rewards/accuracies": 1.0, "rewards/chosen": 0.9523208737373352, "rewards/margins": 0.3753248453140259, "rewards/rejected": 0.5769960284233093, "step": 1401 }, { "epoch": 0.76, "learning_rate": 9.351620155723752e-08, "logits/chosen": -1.9817143678665161, "logits/rejected": -1.9899426698684692, "logps/chosen": -8.174271583557129, "logps/rejected": -1.9063918590545654, "loss": 0.7396, "rewards/accuracies": 0.0, "rewards/chosen": 1.0157405138015747, "rewards/margins": -0.09074807167053223, "rewards/rejected": 1.106488585472107, "step": 1402 }, { "epoch": 0.76, "learning_rate": 9.35054427146527e-08, "logits/chosen": -2.1164557933807373, "logits/rejected": -2.1129188537597656, "logps/chosen": -4.839179515838623, "logps/rejected": -7.690237522125244, "loss": 0.5953, "rewards/accuracies": 1.0, "rewards/chosen": 0.9053279757499695, "rewards/margins": 0.20624661445617676, "rewards/rejected": 0.6990813612937927, "step": 1403 }, { "epoch": 0.76, "learning_rate": 9.349467557311668e-08, "logits/chosen": -2.070932626724243, "logits/rejected": -2.2544593811035156, "logps/chosen": -0.5829511284828186, "logps/rejected": -0.5909342169761658, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.890932559967041, "rewards/margins": 0.019511699676513672, "rewards/rejected": 0.8714208602905273, "step": 1404 }, { "epoch": 0.76, "learning_rate": 9.348390013468339e-08, "logits/chosen": -2.096374273300171, "logits/rejected": -2.2651724815368652, "logps/chosen": -0.8130921125411987, "logps/rejected": -0.762890100479126, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.8156350255012512, "rewards/margins": 0.015052437782287598, "rewards/rejected": 0.8005825877189636, "step": 1405 }, { "epoch": 0.76, "learning_rate": 9.34731164014083e-08, "logits/chosen": -2.0575406551361084, "logits/rejected": -2.052786350250244, "logps/chosen": -3.775580883026123, "logps/rejected": -3.2350590229034424, "loss": 0.5704, "rewards/accuracies": 1.0, "rewards/chosen": 1.0697691440582275, "rewards/margins": 0.26270824670791626, "rewards/rejected": 0.8070608973503113, "step": 1406 }, { "epoch": 0.76, "learning_rate": 9.346232437534847e-08, "logits/chosen": -2.0988433361053467, "logits/rejected": -2.071751594543457, "logps/chosen": -31.409093856811523, "logps/rejected": -17.945350646972656, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": 1.3578294515609741, "rewards/margins": 0.5835395455360413, "rewards/rejected": 0.7742899060249329, "step": 1407 }, { "epoch": 0.76, "learning_rate": 9.345152405856256e-08, "logits/chosen": -2.1786396503448486, "logits/rejected": -2.0527944564819336, "logps/chosen": -52.450462341308594, "logps/rejected": -3.102576494216919, "loss": 0.2839, "rewards/accuracies": 1.0, "rewards/chosen": 1.8236702680587769, "rewards/margins": 1.1140060424804688, "rewards/rejected": 0.7096642851829529, "step": 1408 }, { "epoch": 0.76, "learning_rate": 9.344071545311081e-08, "logits/chosen": -2.0808303356170654, "logits/rejected": -2.0835680961608887, "logps/chosen": -1.9844155311584473, "logps/rejected": -3.2093143463134766, "loss": 0.4962, "rewards/accuracies": 1.0, "rewards/chosen": 1.0554230213165283, "rewards/margins": 0.4424756169319153, "rewards/rejected": 0.612947404384613, "step": 1409 }, { "epoch": 0.76, "learning_rate": 9.342989856105501e-08, "logits/chosen": -2.117663860321045, "logits/rejected": -2.1173009872436523, "logps/chosen": -0.4072197675704956, "logps/rejected": -3.487722396850586, "loss": 0.507, "rewards/accuracies": 1.0, "rewards/chosen": 0.9860376715660095, "rewards/margins": 0.4151228070259094, "rewards/rejected": 0.5709148645401001, "step": 1410 }, { "epoch": 0.76, "learning_rate": 9.341907338445858e-08, "logits/chosen": -2.2741641998291016, "logits/rejected": -2.333717107772827, "logps/chosen": -0.5671360492706299, "logps/rejected": -0.5813902020454407, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9372380375862122, "rewards/margins": 0.005042374134063721, "rewards/rejected": 0.9321956634521484, "step": 1411 }, { "epoch": 0.76, "learning_rate": 9.340823992538646e-08, "logits/chosen": -2.2164433002471924, "logits/rejected": -2.1098923683166504, "logps/chosen": -43.95806884765625, "logps/rejected": -3.9808878898620605, "loss": 0.3799, "rewards/accuracies": 1.0, "rewards/chosen": 1.6234005689620972, "rewards/margins": 0.7718397974967957, "rewards/rejected": 0.8515607714653015, "step": 1412 }, { "epoch": 0.76, "learning_rate": 9.339739818590526e-08, "logits/chosen": -2.0586471557617188, "logits/rejected": -2.05806303024292, "logps/chosen": -2.3196394443511963, "logps/rejected": -4.0165276527404785, "loss": 0.5318, "rewards/accuracies": 1.0, "rewards/chosen": 0.9023992419242859, "rewards/margins": 0.35393887758255005, "rewards/rejected": 0.5484603643417358, "step": 1413 }, { "epoch": 0.76, "learning_rate": 9.338654816808305e-08, "logits/chosen": -2.1029303073883057, "logits/rejected": -2.252352237701416, "logps/chosen": -2.8385491371154785, "logps/rejected": -4.350284576416016, "loss": 0.5937, "rewards/accuracies": 1.0, "rewards/chosen": 0.887485921382904, "rewards/margins": 0.20996171236038208, "rewards/rejected": 0.677524209022522, "step": 1414 }, { "epoch": 0.76, "learning_rate": 9.337568987398958e-08, "logits/chosen": -1.9893735647201538, "logits/rejected": -2.2584824562072754, "logps/chosen": -1.669180154800415, "logps/rejected": -1.5917686223983765, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.9546502232551575, "rewards/margins": 0.00798189640045166, "rewards/rejected": 0.9466683268547058, "step": 1415 }, { "epoch": 0.76, "learning_rate": 9.336482330569615e-08, "logits/chosen": -2.1234214305877686, "logits/rejected": -2.2527124881744385, "logps/chosen": -0.6984007954597473, "logps/rejected": -0.7513445019721985, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.8770946860313416, "rewards/margins": 0.009071648120880127, "rewards/rejected": 0.8680230379104614, "step": 1416 }, { "epoch": 0.76, "learning_rate": 9.335394846527558e-08, "logits/chosen": -2.0630898475646973, "logits/rejected": -2.2836074829101562, "logps/chosen": -1.5705525875091553, "logps/rejected": -1.5684391260147095, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.7449397444725037, "rewards/margins": 0.0058405399322509766, "rewards/rejected": 0.7390992045402527, "step": 1417 }, { "epoch": 0.76, "learning_rate": 9.334306535480237e-08, "logits/chosen": -2.1329591274261475, "logits/rejected": -2.0264666080474854, "logps/chosen": -11.252440452575684, "logps/rejected": -2.4985148906707764, "loss": 0.3716, "rewards/accuracies": 1.0, "rewards/chosen": 1.5625861883163452, "rewards/margins": 0.7985491156578064, "rewards/rejected": 0.7640370726585388, "step": 1418 }, { "epoch": 0.77, "learning_rate": 9.333217397635253e-08, "logits/chosen": -2.125859498977661, "logits/rejected": -2.1333062648773193, "logps/chosen": -2.1991164684295654, "logps/rejected": -3.382538318634033, "loss": 0.4366, "rewards/accuracies": 1.0, "rewards/chosen": 1.2953944206237793, "rewards/margins": 0.6024949550628662, "rewards/rejected": 0.6928994655609131, "step": 1419 }, { "epoch": 0.77, "learning_rate": 9.332127433200365e-08, "logits/chosen": -1.986470103263855, "logits/rejected": -2.248018503189087, "logps/chosen": -5.313808441162109, "logps/rejected": -5.282994747161865, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.7630109786987305, "rewards/margins": 0.014860928058624268, "rewards/rejected": 0.7481500506401062, "step": 1420 }, { "epoch": 0.77, "learning_rate": 9.331036642383492e-08, "logits/chosen": -2.0808849334716797, "logits/rejected": -2.2144832611083984, "logps/chosen": -1.5524723529815674, "logps/rejected": -1.5567318201065063, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9112681746482849, "rewards/margins": 0.0188523530960083, "rewards/rejected": 0.8924158215522766, "step": 1421 }, { "epoch": 0.77, "learning_rate": 9.32994502539271e-08, "logits/chosen": -1.9997483491897583, "logits/rejected": -2.0123114585876465, "logps/chosen": -3.5651118755340576, "logps/rejected": -1.8041572570800781, "loss": 0.5767, "rewards/accuracies": 1.0, "rewards/chosen": 0.8665904402732849, "rewards/margins": 0.24835187196731567, "rewards/rejected": 0.6182385683059692, "step": 1422 }, { "epoch": 0.77, "learning_rate": 9.32885258243625e-08, "logits/chosen": -1.9774953126907349, "logits/rejected": -1.919216275215149, "logps/chosen": -21.245431900024414, "logps/rejected": -1.6521306037902832, "loss": 0.5365, "rewards/accuracies": 1.0, "rewards/chosen": 1.2348226308822632, "rewards/margins": 0.342490553855896, "rewards/rejected": 0.8923320770263672, "step": 1423 }, { "epoch": 0.77, "learning_rate": 9.327759313722506e-08, "logits/chosen": -2.2124812602996826, "logits/rejected": -2.3810477256774902, "logps/chosen": -11.160690307617188, "logps/rejected": -11.288671493530273, "loss": 0.6689, "rewards/accuracies": 1.0, "rewards/chosen": 1.020293116569519, "rewards/margins": 0.04916024208068848, "rewards/rejected": 0.9711328744888306, "step": 1424 }, { "epoch": 0.77, "learning_rate": 9.326665219460024e-08, "logits/chosen": -2.023313522338867, "logits/rejected": -2.0128982067108154, "logps/chosen": -5.700179576873779, "logps/rejected": -1.9129149913787842, "loss": 0.4344, "rewards/accuracies": 1.0, "rewards/chosen": 1.4949588775634766, "rewards/margins": 0.6086258888244629, "rewards/rejected": 0.8863329887390137, "step": 1425 }, { "epoch": 0.77, "learning_rate": 9.32557029985751e-08, "logits/chosen": -2.139754295349121, "logits/rejected": -2.326465368270874, "logps/chosen": -0.7470479607582092, "logps/rejected": -18.121360778808594, "loss": 0.6282, "rewards/accuracies": 1.0, "rewards/chosen": 0.8084967732429504, "rewards/margins": 0.13447314500808716, "rewards/rejected": 0.6740236282348633, "step": 1426 }, { "epoch": 0.77, "learning_rate": 9.324474555123827e-08, "logits/chosen": -2.033985137939453, "logits/rejected": -2.0408990383148193, "logps/chosen": -0.8547291159629822, "logps/rejected": -5.848030090332031, "loss": 0.466, "rewards/accuracies": 1.0, "rewards/chosen": 0.9828649759292603, "rewards/margins": 0.5214203596115112, "rewards/rejected": 0.46144458651542664, "step": 1427 }, { "epoch": 0.77, "learning_rate": 9.323377985467997e-08, "logits/chosen": -2.0056190490722656, "logits/rejected": -2.30501651763916, "logps/chosen": -0.4628027081489563, "logps/rejected": -0.5697361826896667, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.9631000757217407, "rewards/margins": 0.01474916934967041, "rewards/rejected": 0.9483509063720703, "step": 1428 }, { "epoch": 0.77, "learning_rate": 9.322280591099195e-08, "logits/chosen": -1.9792990684509277, "logits/rejected": -1.979452133178711, "logps/chosen": -0.5148351788520813, "logps/rejected": -6.177272796630859, "loss": 0.4357, "rewards/accuracies": 1.0, "rewards/chosen": 1.030609369277954, "rewards/margins": 0.6051883101463318, "rewards/rejected": 0.4254210591316223, "step": 1429 }, { "epoch": 0.77, "learning_rate": 9.32118237222676e-08, "logits/chosen": -2.113396167755127, "logits/rejected": -2.117903709411621, "logps/chosen": -4.731260776519775, "logps/rejected": -0.508823573589325, "loss": 0.5835, "rewards/accuracies": 1.0, "rewards/chosen": 1.195060133934021, "rewards/margins": 0.2327318787574768, "rewards/rejected": 0.9623282551765442, "step": 1430 }, { "epoch": 0.77, "learning_rate": 9.320083329060182e-08, "logits/chosen": -2.0856175422668457, "logits/rejected": -2.099789619445801, "logps/chosen": -4.956752300262451, "logps/rejected": -3.4305145740509033, "loss": 0.5593, "rewards/accuracies": 1.0, "rewards/chosen": 1.1027072668075562, "rewards/margins": 0.28835034370422363, "rewards/rejected": 0.8143569231033325, "step": 1431 }, { "epoch": 0.77, "learning_rate": 9.318983461809111e-08, "logits/chosen": -1.9989537000656128, "logits/rejected": -1.9722278118133545, "logps/chosen": -10.233575820922852, "logps/rejected": -5.341067790985107, "loss": 0.4833, "rewards/accuracies": 1.0, "rewards/chosen": 1.2364082336425781, "rewards/margins": 0.47569113969802856, "rewards/rejected": 0.7607170939445496, "step": 1432 }, { "epoch": 0.77, "learning_rate": 9.317882770683355e-08, "logits/chosen": -2.0426559448242188, "logits/rejected": -2.2960002422332764, "logps/chosen": -0.8727224469184875, "logps/rejected": -0.8009325861930847, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.8914818167686462, "rewards/margins": 0.018497884273529053, "rewards/rejected": 0.8729839324951172, "step": 1433 }, { "epoch": 0.77, "learning_rate": 9.316781255892878e-08, "logits/chosen": -2.0334575176239014, "logits/rejected": -2.1426291465759277, "logps/chosen": -0.7976726293563843, "logps/rejected": -21.621355056762695, "loss": 0.3503, "rewards/accuracies": 1.0, "rewards/chosen": 0.9452304840087891, "rewards/margins": 0.868674635887146, "rewards/rejected": 0.07655582576990128, "step": 1434 }, { "epoch": 0.77, "learning_rate": 9.3156789176478e-08, "logits/chosen": -2.3124914169311523, "logits/rejected": -2.1991264820098877, "logps/chosen": -37.17141342163086, "logps/rejected": -2.7335853576660156, "loss": 0.4146, "rewards/accuracies": 1.0, "rewards/chosen": 1.435935616493225, "rewards/margins": 0.6659409403800964, "rewards/rejected": 0.7699946761131287, "step": 1435 }, { "epoch": 0.77, "learning_rate": 9.314575756158399e-08, "logits/chosen": -2.1469082832336426, "logits/rejected": -2.1476426124572754, "logps/chosen": -6.884248733520508, "logps/rejected": -10.021377563476562, "loss": 0.2591, "rewards/accuracies": 1.0, "rewards/chosen": 1.660884141921997, "rewards/margins": 1.218003749847412, "rewards/rejected": 0.44288045167922974, "step": 1436 }, { "epoch": 0.78, "learning_rate": 9.313471771635113e-08, "logits/chosen": -2.1022558212280273, "logits/rejected": -2.234470844268799, "logps/chosen": -3.6069939136505127, "logps/rejected": -4.403571128845215, "loss": 0.7029, "rewards/accuracies": 0.0, "rewards/chosen": 0.7366677522659302, "rewards/margins": -0.019335567951202393, "rewards/rejected": 0.7560033202171326, "step": 1437 }, { "epoch": 0.78, "learning_rate": 9.312366964288531e-08, "logits/chosen": -2.007734537124634, "logits/rejected": -2.2557995319366455, "logps/chosen": -1.7443073987960815, "logps/rejected": -1.888124704360962, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.8582523465156555, "rewards/margins": 0.0030983686447143555, "rewards/rejected": 0.8551539778709412, "step": 1438 }, { "epoch": 0.78, "learning_rate": 9.311261334329406e-08, "logits/chosen": -2.049901247024536, "logits/rejected": -2.3314027786254883, "logps/chosen": -1.8594683408737183, "logps/rejected": -19.14546012878418, "loss": 0.3501, "rewards/accuracies": 1.0, "rewards/chosen": 0.9699410796165466, "rewards/margins": 0.8694339394569397, "rewards/rejected": 0.10050716251134872, "step": 1439 }, { "epoch": 0.78, "learning_rate": 9.310154881968642e-08, "logits/chosen": -1.972937822341919, "logits/rejected": -1.9764455556869507, "logps/chosen": -2.453197956085205, "logps/rejected": -3.32645845413208, "loss": 0.5608, "rewards/accuracies": 1.0, "rewards/chosen": 0.9688861966133118, "rewards/margins": 0.2848092317581177, "rewards/rejected": 0.6840769648551941, "step": 1440 }, { "epoch": 0.78, "learning_rate": 9.309047607417302e-08, "logits/chosen": -2.0215959548950195, "logits/rejected": -2.24564790725708, "logps/chosen": -1.3881165981292725, "logps/rejected": -1.3727308511734009, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.9349666833877563, "rewards/margins": 0.015305042266845703, "rewards/rejected": 0.9196616411209106, "step": 1441 }, { "epoch": 0.78, "learning_rate": 9.307939510886606e-08, "logits/chosen": -2.0646305084228516, "logits/rejected": -2.0558273792266846, "logps/chosen": -9.437042236328125, "logps/rejected": -5.764470100402832, "loss": 0.3363, "rewards/accuracies": 1.0, "rewards/chosen": 1.429051399230957, "rewards/margins": 0.9168640971183777, "rewards/rejected": 0.5121873021125793, "step": 1442 }, { "epoch": 0.78, "learning_rate": 9.30683059258793e-08, "logits/chosen": -2.122253894805908, "logits/rejected": -2.2724971771240234, "logps/chosen": -0.6803411245346069, "logps/rejected": -0.7651664614677429, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9630729556083679, "rewards/margins": 0.005784571170806885, "rewards/rejected": 0.957288384437561, "step": 1443 }, { "epoch": 0.78, "learning_rate": 9.30572085273281e-08, "logits/chosen": -2.022433042526245, "logits/rejected": -2.219708204269409, "logps/chosen": -7.317874908447266, "logps/rejected": -4.263574123382568, "loss": 0.7056, "rewards/accuracies": 0.0, "rewards/chosen": 0.7921760678291321, "rewards/margins": -0.024808108806610107, "rewards/rejected": 0.8169841766357422, "step": 1444 }, { "epoch": 0.78, "learning_rate": 9.304610291532934e-08, "logits/chosen": -1.9683587551116943, "logits/rejected": -2.2719407081604004, "logps/chosen": -0.8448074460029602, "logps/rejected": -0.83127760887146, "loss": 0.7046, "rewards/accuracies": 0.0, "rewards/chosen": 0.7648651003837585, "rewards/margins": -0.02270364761352539, "rewards/rejected": 0.7875687479972839, "step": 1445 }, { "epoch": 0.78, "learning_rate": 9.303498909200151e-08, "logits/chosen": -2.074368953704834, "logits/rejected": -2.0819945335388184, "logps/chosen": -2.779400587081909, "logps/rejected": -4.61452054977417, "loss": 0.4234, "rewards/accuracies": 1.0, "rewards/chosen": 1.1188781261444092, "rewards/margins": 0.6402806043624878, "rewards/rejected": 0.478597491979599, "step": 1446 }, { "epoch": 0.78, "learning_rate": 9.302386705946463e-08, "logits/chosen": -2.182574510574341, "logits/rejected": -2.2826883792877197, "logps/chosen": -5.153425216674805, "logps/rejected": -1.6457761526107788, "loss": 0.7653, "rewards/accuracies": 0.0, "rewards/chosen": 1.103771448135376, "rewards/margins": -0.13949978351593018, "rewards/rejected": 1.2432712316513062, "step": 1447 }, { "epoch": 0.78, "learning_rate": 9.301273681984031e-08, "logits/chosen": -2.0110583305358887, "logits/rejected": -2.010103225708008, "logps/chosen": -2.9719655513763428, "logps/rejected": -4.158902168273926, "loss": 0.5219, "rewards/accuracies": 1.0, "rewards/chosen": 1.057152271270752, "rewards/margins": 0.37799185514450073, "rewards/rejected": 0.6791604161262512, "step": 1448 }, { "epoch": 0.78, "learning_rate": 9.30015983752517e-08, "logits/chosen": -2.100051164627075, "logits/rejected": -2.0977425575256348, "logps/chosen": -8.073503494262695, "logps/rejected": -3.7962255477905273, "loss": 0.5384, "rewards/accuracies": 1.0, "rewards/chosen": 1.372305154800415, "rewards/margins": 0.33789849281311035, "rewards/rejected": 1.0344066619873047, "step": 1449 }, { "epoch": 0.78, "learning_rate": 9.299045172782355e-08, "logits/chosen": -2.0287892818450928, "logits/rejected": -2.248612403869629, "logps/chosen": -0.47674429416656494, "logps/rejected": -0.4773588180541992, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.8750165104866028, "rewards/margins": 0.015554726123809814, "rewards/rejected": 0.859461784362793, "step": 1450 }, { "epoch": 0.78, "learning_rate": 9.297929687968214e-08, "logits/chosen": -1.9270597696304321, "logits/rejected": -2.246797561645508, "logps/chosen": -1.021058201789856, "logps/rejected": -0.9853876829147339, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.8327422142028809, "rewards/margins": 0.03417634963989258, "rewards/rejected": 0.7985658645629883, "step": 1451 }, { "epoch": 0.78, "learning_rate": 9.296813383295535e-08, "logits/chosen": -2.0266575813293457, "logits/rejected": -2.2068254947662354, "logps/chosen": -0.8793777227401733, "logps/rejected": -1.029618263244629, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.7817111611366272, "rewards/margins": -0.0010274648666381836, "rewards/rejected": 0.7827386260032654, "step": 1452 }, { "epoch": 0.78, "learning_rate": 9.295696258977259e-08, "logits/chosen": -2.101916551589966, "logits/rejected": -2.0910816192626953, "logps/chosen": -7.192819118499756, "logps/rejected": -3.6506869792938232, "loss": 0.4084, "rewards/accuracies": 1.0, "rewards/chosen": 1.2244691848754883, "rewards/margins": 0.6842793226242065, "rewards/rejected": 0.5401898622512817, "step": 1453 }, { "epoch": 0.78, "learning_rate": 9.294578315226486e-08, "logits/chosen": -1.9830107688903809, "logits/rejected": -2.2133750915527344, "logps/chosen": -0.4756425619125366, "logps/rejected": -0.5040258765220642, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.7718626856803894, "rewards/margins": 0.005852639675140381, "rewards/rejected": 0.766010046005249, "step": 1454 }, { "epoch": 0.78, "learning_rate": 9.293459552256471e-08, "logits/chosen": -2.0879786014556885, "logits/rejected": -2.2853481769561768, "logps/chosen": -2.362837076187134, "logps/rejected": -7.03841495513916, "loss": 0.5664, "rewards/accuracies": 1.0, "rewards/chosen": 0.7028722763061523, "rewards/margins": 0.2718503773212433, "rewards/rejected": 0.43102189898490906, "step": 1455 }, { "epoch": 0.79, "learning_rate": 9.292339970280622e-08, "logits/chosen": -2.043905019760132, "logits/rejected": -2.0562946796417236, "logps/chosen": -6.543699264526367, "logps/rejected": -9.93358325958252, "loss": 0.293, "rewards/accuracies": 1.0, "rewards/chosen": 1.592317819595337, "rewards/margins": 1.077364444732666, "rewards/rejected": 0.5149533152580261, "step": 1456 }, { "epoch": 0.79, "learning_rate": 9.291219569512514e-08, "logits/chosen": -2.083425998687744, "logits/rejected": -2.078401565551758, "logps/chosen": -0.8781045079231262, "logps/rejected": -3.64201283454895, "loss": 0.5974, "rewards/accuracies": 1.0, "rewards/chosen": 0.9153448939323425, "rewards/margins": 0.2017073631286621, "rewards/rejected": 0.7136375308036804, "step": 1457 }, { "epoch": 0.79, "learning_rate": 9.290098350165865e-08, "logits/chosen": -2.0808143615722656, "logits/rejected": -2.0867860317230225, "logps/chosen": -0.7393831014633179, "logps/rejected": -13.630882263183594, "loss": 0.4494, "rewards/accuracies": 1.0, "rewards/chosen": 0.9266976714134216, "rewards/margins": 0.5667550563812256, "rewards/rejected": 0.35994264483451843, "step": 1458 }, { "epoch": 0.79, "learning_rate": 9.288976312454555e-08, "logits/chosen": -2.2304587364196777, "logits/rejected": -2.156984567642212, "logps/chosen": -28.695396423339844, "logps/rejected": -4.115640163421631, "loss": 0.3141, "rewards/accuracies": 1.0, "rewards/chosen": 1.540537714958191, "rewards/margins": 0.9967657327651978, "rewards/rejected": 0.5437719821929932, "step": 1459 }, { "epoch": 0.79, "learning_rate": 9.287853456592624e-08, "logits/chosen": -2.085341215133667, "logits/rejected": -2.223104476928711, "logps/chosen": -1.946197271347046, "logps/rejected": -1.5339312553405762, "loss": 0.7236, "rewards/accuracies": 0.0, "rewards/chosen": 0.8646190762519836, "rewards/margins": -0.05992048978805542, "rewards/rejected": 0.9245395660400391, "step": 1460 }, { "epoch": 0.79, "learning_rate": 9.28672978279426e-08, "logits/chosen": -2.1363675594329834, "logits/rejected": -2.1392922401428223, "logps/chosen": -2.836745262145996, "logps/rejected": -3.390807867050171, "loss": 0.4056, "rewards/accuracies": 1.0, "rewards/chosen": 1.2441487312316895, "rewards/margins": 0.6928836703300476, "rewards/rejected": 0.5512650609016418, "step": 1461 }, { "epoch": 0.79, "learning_rate": 9.285605291273815e-08, "logits/chosen": -2.0279743671417236, "logits/rejected": -2.2893824577331543, "logps/chosen": -4.074265480041504, "logps/rejected": -2.751936912536621, "loss": 0.7098, "rewards/accuracies": 0.0, "rewards/chosen": 0.8170933723449707, "rewards/margins": -0.03306913375854492, "rewards/rejected": 0.8501625061035156, "step": 1462 }, { "epoch": 0.79, "learning_rate": 9.28447998224579e-08, "logits/chosen": -2.0084621906280518, "logits/rejected": -2.012699604034424, "logps/chosen": -1.675146222114563, "logps/rejected": -3.6664726734161377, "loss": 0.4525, "rewards/accuracies": 1.0, "rewards/chosen": 0.9117458462715149, "rewards/margins": 0.5581488609313965, "rewards/rejected": 0.353596955537796, "step": 1463 }, { "epoch": 0.79, "learning_rate": 9.283353855924848e-08, "logits/chosen": -2.007214307785034, "logits/rejected": -2.011908769607544, "logps/chosen": -3.7636001110076904, "logps/rejected": -3.2754862308502197, "loss": 0.437, "rewards/accuracies": 1.0, "rewards/chosen": 1.2738155126571655, "rewards/margins": 0.6013447642326355, "rewards/rejected": 0.67247074842453, "step": 1464 }, { "epoch": 0.79, "learning_rate": 9.282226912525801e-08, "logits/chosen": -2.1077656745910645, "logits/rejected": -2.1009016036987305, "logps/chosen": -5.302872657775879, "logps/rejected": -3.45857310295105, "loss": 0.3472, "rewards/accuracies": 1.0, "rewards/chosen": 1.3670560121536255, "rewards/margins": 0.8791536092758179, "rewards/rejected": 0.48790237307548523, "step": 1465 }, { "epoch": 0.79, "learning_rate": 9.281099152263625e-08, "logits/chosen": -2.1376678943634033, "logits/rejected": -2.33371901512146, "logps/chosen": -2.4654533863067627, "logps/rejected": -0.529621958732605, "loss": 0.7665, "rewards/accuracies": 0.0, "rewards/chosen": 0.8524935841560364, "rewards/margins": -0.14161920547485352, "rewards/rejected": 0.9941127896308899, "step": 1466 }, { "epoch": 0.79, "learning_rate": 9.279970575353446e-08, "logits/chosen": -2.0336811542510986, "logits/rejected": -2.2556405067443848, "logps/chosen": -1.3506147861480713, "logps/rejected": -1.3970487117767334, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9356082081794739, "rewards/margins": 0.01619654893875122, "rewards/rejected": 0.9194116592407227, "step": 1467 }, { "epoch": 0.79, "learning_rate": 9.278841182010548e-08, "logits/chosen": -2.088047981262207, "logits/rejected": -2.090031623840332, "logps/chosen": -3.468130588531494, "logps/rejected": -0.9387859106063843, "loss": 0.671, "rewards/accuracies": 1.0, "rewards/chosen": 0.7311747074127197, "rewards/margins": 0.044741272926330566, "rewards/rejected": 0.6864334344863892, "step": 1468 }, { "epoch": 0.79, "learning_rate": 9.27771097245037e-08, "logits/chosen": -1.9992650747299194, "logits/rejected": -1.9943625926971436, "logps/chosen": -5.931213855743408, "logps/rejected": -1.678393840789795, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 1.4889373779296875, "rewards/margins": 0.8636645674705505, "rewards/rejected": 0.625272810459137, "step": 1469 }, { "epoch": 0.79, "learning_rate": 9.276579946888506e-08, "logits/chosen": -2.0862667560577393, "logits/rejected": -2.1820881366729736, "logps/chosen": -1.0851240158081055, "logps/rejected": -1.2680546045303345, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.9083461761474609, "rewards/margins": 0.0037519335746765137, "rewards/rejected": 0.9045942425727844, "step": 1470 }, { "epoch": 0.79, "learning_rate": 9.275448105540708e-08, "logits/chosen": -2.147397041320801, "logits/rejected": -2.2606372833251953, "logps/chosen": -2.108492851257324, "logps/rejected": -1.8956248760223389, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9038435220718384, "rewards/margins": 0.013433456420898438, "rewards/rejected": 0.8904100656509399, "step": 1471 }, { "epoch": 0.79, "learning_rate": 9.274315448622882e-08, "logits/chosen": -2.1157050132751465, "logits/rejected": -2.3364689350128174, "logps/chosen": -0.9539220333099365, "logps/rejected": -1.0128072500228882, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.710309624671936, "rewards/margins": 0.02976125478744507, "rewards/rejected": 0.680548369884491, "step": 1472 }, { "epoch": 0.79, "learning_rate": 9.27318197635109e-08, "logits/chosen": -2.0050227642059326, "logits/rejected": -2.006826877593994, "logps/chosen": -2.998047113418579, "logps/rejected": -4.715752124786377, "loss": 0.361, "rewards/accuracies": 1.0, "rewards/chosen": 1.4152536392211914, "rewards/margins": 0.8330504894256592, "rewards/rejected": 0.5822031497955322, "step": 1473 }, { "epoch": 0.8, "learning_rate": 9.272047688941549e-08, "logits/chosen": -2.1668171882629395, "logits/rejected": -2.163334846496582, "logps/chosen": -7.2042622566223145, "logps/rejected": -4.521297931671143, "loss": 0.3979, "rewards/accuracies": 1.0, "rewards/chosen": 1.216257929801941, "rewards/margins": 0.7161564230918884, "rewards/rejected": 0.5001015067100525, "step": 1474 }, { "epoch": 0.8, "learning_rate": 9.27091258661063e-08, "logits/chosen": -2.072957754135132, "logits/rejected": -2.227703809738159, "logps/chosen": -1.2622748613357544, "logps/rejected": -1.3630167245864868, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.7065807580947876, "rewards/margins": 0.03634566068649292, "rewards/rejected": 0.6702350974082947, "step": 1475 }, { "epoch": 0.8, "learning_rate": 9.269776669574865e-08, "logits/chosen": -2.008702278137207, "logits/rejected": -2.253840446472168, "logps/chosen": -0.47299471497535706, "logps/rejected": -0.5638421773910522, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 1.0223113298416138, "rewards/margins": 0.015528678894042969, "rewards/rejected": 1.0067826509475708, "step": 1476 }, { "epoch": 0.8, "learning_rate": 9.268639938050935e-08, "logits/chosen": -2.084077835083008, "logits/rejected": -2.2476892471313477, "logps/chosen": -0.5040795803070068, "logps/rejected": -0.5023093223571777, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.7510563135147095, "rewards/margins": 0.014848411083221436, "rewards/rejected": 0.736207902431488, "step": 1477 }, { "epoch": 0.8, "learning_rate": 9.267502392255679e-08, "logits/chosen": -2.0173566341400146, "logits/rejected": -2.265746831893921, "logps/chosen": -14.436151504516602, "logps/rejected": -8.749385833740234, "loss": 0.7104, "rewards/accuracies": 0.0, "rewards/chosen": 0.6384561657905579, "rewards/margins": -0.03414708375930786, "rewards/rejected": 0.6726032495498657, "step": 1478 }, { "epoch": 0.8, "learning_rate": 9.266364032406093e-08, "logits/chosen": -2.194805383682251, "logits/rejected": -2.1810309886932373, "logps/chosen": -11.399006843566895, "logps/rejected": -5.526670455932617, "loss": 0.5992, "rewards/accuracies": 1.0, "rewards/chosen": 1.1595427989959717, "rewards/margins": 0.19767552614212036, "rewards/rejected": 0.9618672728538513, "step": 1479 }, { "epoch": 0.8, "learning_rate": 9.265224858719326e-08, "logits/chosen": -2.0524415969848633, "logits/rejected": -2.226954460144043, "logps/chosen": -7.137359619140625, "logps/rejected": -8.54273509979248, "loss": 0.612, "rewards/accuracies": 1.0, "rewards/chosen": 0.9731205105781555, "rewards/margins": 0.1695488691329956, "rewards/rejected": 0.8035716414451599, "step": 1480 }, { "epoch": 0.8, "learning_rate": 9.264084871412683e-08, "logits/chosen": -1.9899253845214844, "logits/rejected": -1.9851967096328735, "logps/chosen": -4.266217231750488, "logps/rejected": -3.2373602390289307, "loss": 0.3856, "rewards/accuracies": 1.0, "rewards/chosen": 1.3858665227890015, "rewards/margins": 0.7540795207023621, "rewards/rejected": 0.6317870020866394, "step": 1481 }, { "epoch": 0.8, "learning_rate": 9.262944070703622e-08, "logits/chosen": -2.095689296722412, "logits/rejected": -2.2452054023742676, "logps/chosen": -0.3105698525905609, "logps/rejected": -0.3375783860683441, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.8290023803710938, "rewards/margins": 0.013916194438934326, "rewards/rejected": 0.8150861859321594, "step": 1482 }, { "epoch": 0.8, "learning_rate": 9.261802456809762e-08, "logits/chosen": -2.022179126739502, "logits/rejected": -2.0225982666015625, "logps/chosen": -4.2955546379089355, "logps/rejected": -1.0266742706298828, "loss": 0.5112, "rewards/accuracies": 1.0, "rewards/chosen": 1.3492802381515503, "rewards/margins": 0.4046156406402588, "rewards/rejected": 0.9446645975112915, "step": 1483 }, { "epoch": 0.8, "learning_rate": 9.260660029948872e-08, "logits/chosen": -2.234287738800049, "logits/rejected": -2.1402549743652344, "logps/chosen": -32.54594039916992, "logps/rejected": -3.3215315341949463, "loss": 0.3721, "rewards/accuracies": 1.0, "rewards/chosen": 1.3279956579208374, "rewards/margins": 0.796901285648346, "rewards/rejected": 0.5310943722724915, "step": 1484 }, { "epoch": 0.8, "learning_rate": 9.259516790338875e-08, "logits/chosen": -2.1833038330078125, "logits/rejected": -2.2465643882751465, "logps/chosen": -4.24738883972168, "logps/rejected": -12.542113304138184, "loss": 0.4857, "rewards/accuracies": 1.0, "rewards/chosen": 1.256968379020691, "rewards/margins": 0.469478964805603, "rewards/rejected": 0.7874894142150879, "step": 1485 }, { "epoch": 0.8, "learning_rate": 9.258372738197855e-08, "logits/chosen": -2.017319440841675, "logits/rejected": -2.2224159240722656, "logps/chosen": -1.0724842548370361, "logps/rejected": -0.9812526106834412, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.8558176159858704, "rewards/margins": 0.020582377910614014, "rewards/rejected": 0.8352352380752563, "step": 1486 }, { "epoch": 0.8, "learning_rate": 9.257227873744046e-08, "logits/chosen": -2.012056589126587, "logits/rejected": -2.017916202545166, "logps/chosen": -0.8704484701156616, "logps/rejected": -3.154420852661133, "loss": 0.507, "rewards/accuracies": 1.0, "rewards/chosen": 0.9701983332633972, "rewards/margins": 0.4151560664176941, "rewards/rejected": 0.5550422668457031, "step": 1487 }, { "epoch": 0.8, "learning_rate": 9.256082197195839e-08, "logits/chosen": -2.0707225799560547, "logits/rejected": -2.0885062217712402, "logps/chosen": -1.4107437133789062, "logps/rejected": -8.200299263000488, "loss": 0.4716, "rewards/accuracies": 1.0, "rewards/chosen": 1.1478667259216309, "rewards/margins": 0.5066075921058655, "rewards/rejected": 0.6412591338157654, "step": 1488 }, { "epoch": 0.8, "learning_rate": 9.254935708771778e-08, "logits/chosen": -1.9908511638641357, "logits/rejected": -2.0264391899108887, "logps/chosen": -9.400458335876465, "logps/rejected": -19.03261947631836, "loss": 0.3883, "rewards/accuracies": 1.0, "rewards/chosen": 1.0867360830307007, "rewards/margins": 0.7456502914428711, "rewards/rejected": 0.341085821390152, "step": 1489 }, { "epoch": 0.8, "learning_rate": 9.253788408690564e-08, "logits/chosen": -2.056269884109497, "logits/rejected": -2.2606711387634277, "logps/chosen": -8.030165672302246, "logps/rejected": -1.2856602668762207, "loss": 0.9299, "rewards/accuracies": 0.0, "rewards/chosen": 0.6519433856010437, "rewards/margins": -0.4280361533164978, "rewards/rejected": 1.0799795389175415, "step": 1490 }, { "epoch": 0.8, "learning_rate": 9.252640297171053e-08, "logits/chosen": -1.991044282913208, "logits/rejected": -1.9958614110946655, "logps/chosen": -2.1560304164886475, "logps/rejected": -3.3257346153259277, "loss": 0.528, "rewards/accuracies": 1.0, "rewards/chosen": 0.9236478805541992, "rewards/margins": 0.3631123900413513, "rewards/rejected": 0.5605354905128479, "step": 1491 }, { "epoch": 0.8, "learning_rate": 9.251491374432255e-08, "logits/chosen": -2.0906734466552734, "logits/rejected": -2.2302868366241455, "logps/chosen": -0.8130210638046265, "logps/rejected": -5.469423770904541, "loss": 0.7046, "rewards/accuracies": 0.0, "rewards/chosen": 0.7263586521148682, "rewards/margins": -0.022715985774993896, "rewards/rejected": 0.7490746378898621, "step": 1492 }, { "epoch": 0.81, "learning_rate": 9.250341640693331e-08, "logits/chosen": -2.003009796142578, "logits/rejected": -1.9946930408477783, "logps/chosen": -4.746711730957031, "logps/rejected": -4.422255992889404, "loss": 0.4527, "rewards/accuracies": 1.0, "rewards/chosen": 1.2366424798965454, "rewards/margins": 0.5577123165130615, "rewards/rejected": 0.6789301633834839, "step": 1493 }, { "epoch": 0.81, "learning_rate": 9.249191096173605e-08, "logits/chosen": -2.0839242935180664, "logits/rejected": -2.271293878555298, "logps/chosen": -2.0580334663391113, "logps/rejected": -1.928803563117981, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.998974621295929, "rewards/margins": -0.006873071193695068, "rewards/rejected": 1.005847692489624, "step": 1494 }, { "epoch": 0.81, "learning_rate": 9.24803974109255e-08, "logits/chosen": -2.0478715896606445, "logits/rejected": -2.0521459579467773, "logps/chosen": -2.9221904277801514, "logps/rejected": -11.470727920532227, "loss": 0.4231, "rewards/accuracies": 1.0, "rewards/chosen": 1.0698045492172241, "rewards/margins": 0.6412241458892822, "rewards/rejected": 0.4285803735256195, "step": 1495 }, { "epoch": 0.81, "learning_rate": 9.246887575669793e-08, "logits/chosen": -1.9746854305267334, "logits/rejected": -2.274502754211426, "logps/chosen": -0.8560088872909546, "logps/rejected": -0.8244843482971191, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.752305269241333, "rewards/margins": 0.0232275128364563, "rewards/rejected": 0.7290777564048767, "step": 1496 }, { "epoch": 0.81, "learning_rate": 9.245734600125116e-08, "logits/chosen": -2.033759355545044, "logits/rejected": -2.0382156372070312, "logps/chosen": -0.348807692527771, "logps/rejected": -4.921619415283203, "loss": 0.542, "rewards/accuracies": 1.0, "rewards/chosen": 0.9252762794494629, "rewards/margins": 0.3293014168739319, "rewards/rejected": 0.595974862575531, "step": 1497 }, { "epoch": 0.81, "learning_rate": 9.244580814678463e-08, "logits/chosen": -2.051420211791992, "logits/rejected": -2.050847053527832, "logps/chosen": -3.231767416000366, "logps/rejected": -3.920158624649048, "loss": 0.3645, "rewards/accuracies": 1.0, "rewards/chosen": 1.387447714805603, "rewards/margins": 0.8212864398956299, "rewards/rejected": 0.5661612749099731, "step": 1498 }, { "epoch": 0.81, "learning_rate": 9.243426219549916e-08, "logits/chosen": -2.0386722087860107, "logits/rejected": -2.3175480365753174, "logps/chosen": -5.096799850463867, "logps/rejected": -4.969272136688232, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": 1.0834574699401855, "rewards/margins": -0.00936591625213623, "rewards/rejected": 1.0928233861923218, "step": 1499 }, { "epoch": 0.81, "learning_rate": 9.242270814959731e-08, "logits/chosen": -1.9758391380310059, "logits/rejected": -1.9727787971496582, "logps/chosen": -4.847891807556152, "logps/rejected": -1.730703353881836, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 1.043120265007019, "rewards/margins": -0.004297494888305664, "rewards/rejected": 1.0474177598953247, "step": 1500 }, { "epoch": 0.81, "learning_rate": 9.241114601128305e-08, "logits/chosen": -2.1177620887756348, "logits/rejected": -2.0032427310943604, "logps/chosen": -28.222440719604492, "logps/rejected": -2.8082313537597656, "loss": 0.4706, "rewards/accuracies": 1.0, "rewards/chosen": 1.125032663345337, "rewards/margins": 0.5092918872833252, "rewards/rejected": 0.6157407760620117, "step": 1501 }, { "epoch": 0.81, "learning_rate": 9.239957578276197e-08, "logits/chosen": -2.0453155040740967, "logits/rejected": -2.288268566131592, "logps/chosen": -3.615565299987793, "logps/rejected": -1.179526686668396, "loss": 0.7877, "rewards/accuracies": 0.0, "rewards/chosen": 0.8855797648429871, "rewards/margins": -0.18088418245315552, "rewards/rejected": 1.0664639472961426, "step": 1502 }, { "epoch": 0.81, "learning_rate": 9.23879974662411e-08, "logits/chosen": -2.076134443283081, "logits/rejected": -2.2717857360839844, "logps/chosen": -2.2671313285827637, "logps/rejected": -2.654576063156128, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.8868749737739563, "rewards/margins": 0.026407063007354736, "rewards/rejected": 0.8604679107666016, "step": 1503 }, { "epoch": 0.81, "learning_rate": 9.237641106392912e-08, "logits/chosen": -2.036980152130127, "logits/rejected": -2.2657012939453125, "logps/chosen": -0.6942741870880127, "logps/rejected": -0.6918332576751709, "loss": 0.701, "rewards/accuracies": 0.0, "rewards/chosen": 0.8811268210411072, "rewards/margins": -0.015588819980621338, "rewards/rejected": 0.8967156410217285, "step": 1504 }, { "epoch": 0.81, "learning_rate": 9.236481657803622e-08, "logits/chosen": -2.2353579998016357, "logits/rejected": -2.2452285289764404, "logps/chosen": -5.773466110229492, "logps/rejected": -5.116386890411377, "loss": 0.4582, "rewards/accuracies": 1.0, "rewards/chosen": 1.0564626455307007, "rewards/margins": 0.5426324009895325, "rewards/rejected": 0.5138302445411682, "step": 1505 }, { "epoch": 0.81, "learning_rate": 9.235321401077412e-08, "logits/chosen": -2.0780982971191406, "logits/rejected": -2.275541305541992, "logps/chosen": -1.3484734296798706, "logps/rejected": -1.498616337776184, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 1.0518553256988525, "rewards/margins": 0.02042973041534424, "rewards/rejected": 1.0314255952835083, "step": 1506 }, { "epoch": 0.81, "learning_rate": 9.234160336435608e-08, "logits/chosen": -2.081404209136963, "logits/rejected": -2.081740379333496, "logps/chosen": -1.3756959438323975, "logps/rejected": -4.024061679840088, "loss": 0.5069, "rewards/accuracies": 1.0, "rewards/chosen": 1.037514567375183, "rewards/margins": 0.4153982400894165, "rewards/rejected": 0.6221163272857666, "step": 1507 }, { "epoch": 0.81, "learning_rate": 9.232998464099691e-08, "logits/chosen": -1.842716097831726, "logits/rejected": -2.2609915733337402, "logps/chosen": -1.6513159275054932, "logps/rejected": -1.9278478622436523, "loss": 0.6582, "rewards/accuracies": 1.0, "rewards/chosen": 0.6974257826805115, "rewards/margins": 0.07107716798782349, "rewards/rejected": 0.626348614692688, "step": 1508 }, { "epoch": 0.81, "learning_rate": 9.231835784291296e-08, "logits/chosen": -2.172391414642334, "logits/rejected": -2.2923848628997803, "logps/chosen": -5.265344142913818, "logps/rejected": -0.532910943031311, "loss": 0.7986, "rewards/accuracies": 0.0, "rewards/chosen": 0.8264774680137634, "rewards/margins": -0.20080000162124634, "rewards/rejected": 1.0272774696350098, "step": 1509 }, { "epoch": 0.81, "learning_rate": 9.23067229723221e-08, "logits/chosen": -2.042775869369507, "logits/rejected": -2.0905871391296387, "logps/chosen": -4.21317720413208, "logps/rejected": -18.99261474609375, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 1.3853912353515625, "rewards/margins": 1.4383494853973389, "rewards/rejected": -0.05295829847455025, "step": 1510 }, { "epoch": 0.81, "learning_rate": 9.22950800314438e-08, "logits/chosen": -2.025974750518799, "logits/rejected": -2.2817885875701904, "logps/chosen": -0.6331432461738586, "logps/rejected": -0.7088415622711182, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.966903805732727, "rewards/margins": -0.0024590492248535156, "rewards/rejected": 0.9693628549575806, "step": 1511 }, { "epoch": 0.82, "learning_rate": 9.2283429022499e-08, "logits/chosen": -1.9972944259643555, "logits/rejected": -2.3126776218414307, "logps/chosen": -3.2837326526641846, "logps/rejected": -3.5521950721740723, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 0.9327849745750427, "rewards/margins": -0.0015084147453308105, "rewards/rejected": 0.9342933893203735, "step": 1512 }, { "epoch": 0.82, "learning_rate": 9.227176994771019e-08, "logits/chosen": -2.0428507328033447, "logits/rejected": -2.272089958190918, "logps/chosen": -11.265541076660156, "logps/rejected": -11.038662910461426, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.26583367586135864, "rewards/margins": -0.008126050233840942, "rewards/rejected": 0.2739597260951996, "step": 1513 }, { "epoch": 0.82, "learning_rate": 9.226010280930146e-08, "logits/chosen": -2.05560564994812, "logits/rejected": -2.322606325149536, "logps/chosen": -0.9506674408912659, "logps/rejected": -0.9605401754379272, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 1.1208388805389404, "rewards/margins": 0.014493823051452637, "rewards/rejected": 1.1063450574874878, "step": 1514 }, { "epoch": 0.82, "learning_rate": 9.224842760949835e-08, "logits/chosen": -2.093477249145508, "logits/rejected": -2.092597007751465, "logps/chosen": -1.3914531469345093, "logps/rejected": -3.288304090499878, "loss": 0.5262, "rewards/accuracies": 1.0, "rewards/chosen": 0.9651827216148376, "rewards/margins": 0.3674386143684387, "rewards/rejected": 0.5977441072463989, "step": 1515 }, { "epoch": 0.82, "learning_rate": 9.223674435052802e-08, "logits/chosen": -2.202401638031006, "logits/rejected": -2.1984708309173584, "logps/chosen": -1.5493441820144653, "logps/rejected": -3.9858336448669434, "loss": 0.5241, "rewards/accuracies": 1.0, "rewards/chosen": 1.0073355436325073, "rewards/margins": 0.3725373148918152, "rewards/rejected": 0.6347982287406921, "step": 1516 }, { "epoch": 0.82, "learning_rate": 9.22250530346191e-08, "logits/chosen": -2.1245784759521484, "logits/rejected": -2.119839906692505, "logps/chosen": -3.763456106185913, "logps/rejected": -3.9858486652374268, "loss": 0.5542, "rewards/accuracies": 1.0, "rewards/chosen": 0.9630399942398071, "rewards/margins": 0.3004600405693054, "rewards/rejected": 0.6625799536705017, "step": 1517 }, { "epoch": 0.82, "learning_rate": 9.22133536640018e-08, "logits/chosen": -1.9954198598861694, "logits/rejected": -2.2499592304229736, "logps/chosen": -1.79124915599823, "logps/rejected": -1.4816153049468994, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.8597485423088074, "rewards/margins": -0.0016345381736755371, "rewards/rejected": 0.8613830804824829, "step": 1518 }, { "epoch": 0.82, "learning_rate": 9.220164624090785e-08, "logits/chosen": -2.1143815517425537, "logits/rejected": -2.2930774688720703, "logps/chosen": -7.0428314208984375, "logps/rejected": -6.539368629455566, "loss": 0.7068, "rewards/accuracies": 0.0, "rewards/chosen": 0.47207412123680115, "rewards/margins": -0.027173638343811035, "rewards/rejected": 0.4992477595806122, "step": 1519 }, { "epoch": 0.82, "learning_rate": 9.218993076757053e-08, "logits/chosen": -2.0518133640289307, "logits/rejected": -2.041531801223755, "logps/chosen": -13.856161117553711, "logps/rejected": -3.2624311447143555, "loss": 0.3803, "rewards/accuracies": 1.0, "rewards/chosen": 1.304027795791626, "rewards/margins": 0.7707024812698364, "rewards/rejected": 0.5333253145217896, "step": 1520 }, { "epoch": 0.82, "learning_rate": 9.217820724622462e-08, "logits/chosen": -2.133249282836914, "logits/rejected": -2.214776039123535, "logps/chosen": -0.8122364282608032, "logps/rejected": -0.825279951095581, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.7499318718910217, "rewards/margins": 0.012899339199066162, "rewards/rejected": 0.7370325326919556, "step": 1521 }, { "epoch": 0.82, "learning_rate": 9.216647567910646e-08, "logits/chosen": -2.1339492797851562, "logits/rejected": -2.1368958950042725, "logps/chosen": -2.6352994441986084, "logps/rejected": -2.606112003326416, "loss": 0.5415, "rewards/accuracies": 1.0, "rewards/chosen": 0.9750210046768188, "rewards/margins": 0.3304869532585144, "rewards/rejected": 0.6445340514183044, "step": 1522 }, { "epoch": 0.82, "learning_rate": 9.215473606845396e-08, "logits/chosen": -1.9742604494094849, "logits/rejected": -1.953048586845398, "logps/chosen": -14.970504760742188, "logps/rejected": -6.603425979614258, "loss": 0.456, "rewards/accuracies": 1.0, "rewards/chosen": 1.3073490858078003, "rewards/margins": 0.5486019849777222, "rewards/rejected": 0.7587471008300781, "step": 1523 }, { "epoch": 0.82, "learning_rate": 9.21429884165065e-08, "logits/chosen": -2.0507490634918213, "logits/rejected": -2.2586214542388916, "logps/chosen": -3.026637077331543, "logps/rejected": -2.653960943222046, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.6753283739089966, "rewards/margins": -0.0069931745529174805, "rewards/rejected": 0.6823215484619141, "step": 1524 }, { "epoch": 0.82, "learning_rate": 9.2131232725505e-08, "logits/chosen": -2.0414650440216064, "logits/rejected": -2.040008783340454, "logps/chosen": -0.7112859487533569, "logps/rejected": -1.6533480882644653, "loss": 0.5772, "rewards/accuracies": 1.0, "rewards/chosen": 0.9776104092597961, "rewards/margins": 0.2470596432685852, "rewards/rejected": 0.7305507659912109, "step": 1525 }, { "epoch": 0.82, "learning_rate": 9.211946899769198e-08, "logits/chosen": -2.1122190952301025, "logits/rejected": -2.0891358852386475, "logps/chosen": -17.64504623413086, "logps/rejected": -1.4457933902740479, "loss": 0.483, "rewards/accuracies": 1.0, "rewards/chosen": 1.4284822940826416, "rewards/margins": 0.47642260789871216, "rewards/rejected": 0.9520596861839294, "step": 1526 }, { "epoch": 0.82, "learning_rate": 9.210769723531145e-08, "logits/chosen": -2.1395678520202637, "logits/rejected": -2.2636353969573975, "logps/chosen": -0.4103657603263855, "logps/rejected": -0.4221994876861572, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.9222325682640076, "rewards/margins": 0.024713754653930664, "rewards/rejected": 0.8975188136100769, "step": 1527 }, { "epoch": 0.82, "learning_rate": 9.209591744060892e-08, "logits/chosen": -2.0712692737579346, "logits/rejected": -2.0739431381225586, "logps/chosen": -3.968672275543213, "logps/rejected": -4.821367263793945, "loss": 0.3805, "rewards/accuracies": 1.0, "rewards/chosen": 1.3751691579818726, "rewards/margins": 0.7700132131576538, "rewards/rejected": 0.6051559448242188, "step": 1528 }, { "epoch": 0.82, "learning_rate": 9.208412961583145e-08, "logits/chosen": -2.0734851360321045, "logits/rejected": -2.0769846439361572, "logps/chosen": -0.48875048756599426, "logps/rejected": -3.0628278255462646, "loss": 0.4919, "rewards/accuracies": 1.0, "rewards/chosen": 1.0084213018417358, "rewards/margins": 0.45358121395111084, "rewards/rejected": 0.554840087890625, "step": 1529 }, { "epoch": 0.83, "learning_rate": 9.20723337632277e-08, "logits/chosen": -2.085764169692993, "logits/rejected": -2.104426860809326, "logps/chosen": -6.407054901123047, "logps/rejected": -2.8525948524475098, "loss": 0.5777, "rewards/accuracies": 1.0, "rewards/chosen": 1.0656893253326416, "rewards/margins": 0.2460048794746399, "rewards/rejected": 0.8196844458580017, "step": 1530 }, { "epoch": 0.83, "learning_rate": 9.206052988504778e-08, "logits/chosen": -1.993247389793396, "logits/rejected": -2.232612371444702, "logps/chosen": -2.5371954441070557, "logps/rejected": -2.4272823333740234, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 0.6736281514167786, "rewards/margins": 0.03775817155838013, "rewards/rejected": 0.6358699798583984, "step": 1531 }, { "epoch": 0.83, "learning_rate": 9.204871798354334e-08, "logits/chosen": -2.0661375522613525, "logits/rejected": -2.0646705627441406, "logps/chosen": -3.39837908744812, "logps/rejected": -3.812584400177002, "loss": 0.6002, "rewards/accuracies": 1.0, "rewards/chosen": 0.8953439593315125, "rewards/margins": 0.19539642333984375, "rewards/rejected": 0.6999475359916687, "step": 1532 }, { "epoch": 0.83, "learning_rate": 9.203689806096761e-08, "logits/chosen": -2.1037821769714355, "logits/rejected": -2.3038716316223145, "logps/chosen": -0.8799388408660889, "logps/rejected": -1.0085375308990479, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 1.0083733797073364, "rewards/margins": 0.008954167366027832, "rewards/rejected": 0.9994192123413086, "step": 1533 }, { "epoch": 0.83, "learning_rate": 9.20250701195753e-08, "logits/chosen": -2.1939284801483154, "logits/rejected": -2.1905622482299805, "logps/chosen": -3.5256457328796387, "logps/rejected": -4.065844535827637, "loss": 0.5342, "rewards/accuracies": 1.0, "rewards/chosen": 0.9543636441230774, "rewards/margins": 0.34793949127197266, "rewards/rejected": 0.6064241528511047, "step": 1534 }, { "epoch": 0.83, "learning_rate": 9.201323416162268e-08, "logits/chosen": -2.1818289756774902, "logits/rejected": -2.00976824760437, "logps/chosen": -29.45648193359375, "logps/rejected": -3.583041191101074, "loss": 0.3238, "rewards/accuracies": 1.0, "rewards/chosen": 1.6093708276748657, "rewards/margins": 0.9612643122673035, "rewards/rejected": 0.6481065154075623, "step": 1535 }, { "epoch": 0.83, "learning_rate": 9.200139018936752e-08, "logits/chosen": -2.085951805114746, "logits/rejected": -2.0944302082061768, "logps/chosen": -6.599133491516113, "logps/rejected": -3.117438316345215, "loss": 0.3374, "rewards/accuracies": 1.0, "rewards/chosen": 1.5460337400436401, "rewards/margins": 0.913115918636322, "rewards/rejected": 0.6329178214073181, "step": 1536 }, { "epoch": 0.83, "learning_rate": 9.198953820506914e-08, "logits/chosen": -2.109666347503662, "logits/rejected": -2.0091888904571533, "logps/chosen": -30.256427764892578, "logps/rejected": -3.433532476425171, "loss": 0.3017, "rewards/accuracies": 1.0, "rewards/chosen": 1.6106449365615845, "rewards/margins": 1.0436636209487915, "rewards/rejected": 0.566981315612793, "step": 1537 }, { "epoch": 0.83, "learning_rate": 9.197767821098839e-08, "logits/chosen": -2.076089382171631, "logits/rejected": -2.0402281284332275, "logps/chosen": -6.66531229019165, "logps/rejected": -1.748190999031067, "loss": 0.4026, "rewards/accuracies": 1.0, "rewards/chosen": 1.4589389562606812, "rewards/margins": 0.7018043994903564, "rewards/rejected": 0.7571345567703247, "step": 1538 }, { "epoch": 0.83, "learning_rate": 9.196581020938765e-08, "logits/chosen": -2.041821002960205, "logits/rejected": -2.0497281551361084, "logps/chosen": -1.6398661136627197, "logps/rejected": -2.3044543266296387, "loss": 0.5088, "rewards/accuracies": 1.0, "rewards/chosen": 1.022639513015747, "rewards/margins": 0.41061800718307495, "rewards/rejected": 0.6120215058326721, "step": 1539 }, { "epoch": 0.83, "learning_rate": 9.195393420253081e-08, "logits/chosen": -2.0649383068084717, "logits/rejected": -2.1980738639831543, "logps/chosen": -0.5435900688171387, "logps/rejected": -0.5563231706619263, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.9039499163627625, "rewards/margins": 0.019026637077331543, "rewards/rejected": 0.8849232792854309, "step": 1540 }, { "epoch": 0.83, "learning_rate": 9.19420501926833e-08, "logits/chosen": -2.1716647148132324, "logits/rejected": -2.1379759311676025, "logps/chosen": -20.542768478393555, "logps/rejected": -2.8273797035217285, "loss": 0.3718, "rewards/accuracies": 1.0, "rewards/chosen": 1.3992258310317993, "rewards/margins": 0.7978939414024353, "rewards/rejected": 0.601331889629364, "step": 1541 }, { "epoch": 0.83, "learning_rate": 9.193015818211208e-08, "logits/chosen": -2.1061317920684814, "logits/rejected": -2.2560155391693115, "logps/chosen": -0.38143202662467957, "logps/rejected": -0.42301124334335327, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.9065364003181458, "rewards/margins": -0.007926344871520996, "rewards/rejected": 0.9144627451896667, "step": 1542 }, { "epoch": 0.83, "learning_rate": 9.191825817308561e-08, "logits/chosen": -2.0891151428222656, "logits/rejected": -2.1367673873901367, "logps/chosen": -9.259504318237305, "logps/rejected": -8.09068775177002, "loss": 0.4982, "rewards/accuracies": 1.0, "rewards/chosen": 1.2718199491500854, "rewards/margins": 0.4372560381889343, "rewards/rejected": 0.8345639109611511, "step": 1543 }, { "epoch": 0.83, "learning_rate": 9.190635016787391e-08, "logits/chosen": -1.983208179473877, "logits/rejected": -1.9957070350646973, "logps/chosen": -2.6237432956695557, "logps/rejected": -8.327065467834473, "loss": 0.4531, "rewards/accuracies": 1.0, "rewards/chosen": 1.0444462299346924, "rewards/margins": 0.5566304922103882, "rewards/rejected": 0.4878157675266266, "step": 1544 }, { "epoch": 0.83, "learning_rate": 9.189443416874851e-08, "logits/chosen": -2.0247864723205566, "logits/rejected": -2.021009683609009, "logps/chosen": -7.2573466300964355, "logps/rejected": -1.8392915725708008, "loss": 0.5804, "rewards/accuracies": 1.0, "rewards/chosen": 0.9922224283218384, "rewards/margins": 0.2399119734764099, "rewards/rejected": 0.7523104548454285, "step": 1545 }, { "epoch": 0.83, "learning_rate": 9.188251017798248e-08, "logits/chosen": -2.0333802700042725, "logits/rejected": -2.0376009941101074, "logps/chosen": -1.8942848443984985, "logps/rejected": -4.910943984985352, "loss": 0.4807, "rewards/accuracies": 1.0, "rewards/chosen": 1.1128698587417603, "rewards/margins": 0.4826807975769043, "rewards/rejected": 0.630189061164856, "step": 1546 }, { "epoch": 0.83, "learning_rate": 9.187057819785037e-08, "logits/chosen": -1.9170118570327759, "logits/rejected": -1.918153166770935, "logps/chosen": -1.5296013355255127, "logps/rejected": -3.0625734329223633, "loss": 0.5119, "rewards/accuracies": 1.0, "rewards/chosen": 1.0462831258773804, "rewards/margins": 0.402656614780426, "rewards/rejected": 0.6436265110969543, "step": 1547 }, { "epoch": 0.83, "learning_rate": 9.185863823062829e-08, "logits/chosen": -2.169904947280884, "logits/rejected": -2.037158966064453, "logps/chosen": -38.363399505615234, "logps/rejected": -5.085141181945801, "loss": 0.3501, "rewards/accuracies": 1.0, "rewards/chosen": 1.3656116724014282, "rewards/margins": 0.8694052696228027, "rewards/rejected": 0.4962063729763031, "step": 1548 }, { "epoch": 0.84, "learning_rate": 9.184669027859389e-08, "logits/chosen": -1.983075737953186, "logits/rejected": -1.9716747999191284, "logps/chosen": -1.2307236194610596, "logps/rejected": -3.497229814529419, "loss": 0.5481, "rewards/accuracies": 1.0, "rewards/chosen": 0.9857354164123535, "rewards/margins": 0.31471794843673706, "rewards/rejected": 0.6710174679756165, "step": 1549 }, { "epoch": 0.84, "learning_rate": 9.183473434402631e-08, "logits/chosen": -2.1074702739715576, "logits/rejected": -2.29235577583313, "logps/chosen": -3.588425397872925, "logps/rejected": -3.5105907917022705, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.7371266484260559, "rewards/margins": 0.005360960960388184, "rewards/rejected": 0.7317656874656677, "step": 1550 }, { "epoch": 0.84, "learning_rate": 9.18227704292062e-08, "logits/chosen": -2.0967276096343994, "logits/rejected": -2.090543031692505, "logps/chosen": -3.1941068172454834, "logps/rejected": -3.7174692153930664, "loss": 0.554, "rewards/accuracies": 1.0, "rewards/chosen": 1.188417673110962, "rewards/margins": 0.3007175922393799, "rewards/rejected": 0.887700080871582, "step": 1551 }, { "epoch": 0.84, "learning_rate": 9.18107985364158e-08, "logits/chosen": -2.0837347507476807, "logits/rejected": -2.261489152908325, "logps/chosen": -1.353402853012085, "logps/rejected": -5.98823881149292, "loss": 0.5863, "rewards/accuracies": 1.0, "rewards/chosen": 0.9922050833702087, "rewards/margins": 0.2263948917388916, "rewards/rejected": 0.7658101916313171, "step": 1552 }, { "epoch": 0.84, "learning_rate": 9.179881866793879e-08, "logits/chosen": -2.0363078117370605, "logits/rejected": -2.2809648513793945, "logps/chosen": -1.418224811553955, "logps/rejected": -4.693129539489746, "loss": 0.6518, "rewards/accuracies": 1.0, "rewards/chosen": 1.0310087203979492, "rewards/margins": 0.08448761701583862, "rewards/rejected": 0.9465211033821106, "step": 1553 }, { "epoch": 0.84, "learning_rate": 9.178683082606044e-08, "logits/chosen": -2.177351474761963, "logits/rejected": -2.369396448135376, "logps/chosen": -17.200435638427734, "logps/rejected": -17.668066024780273, "loss": 0.6041, "rewards/accuracies": 1.0, "rewards/chosen": 1.0595073699951172, "rewards/margins": 0.1867736577987671, "rewards/rejected": 0.8727337121963501, "step": 1554 }, { "epoch": 0.84, "learning_rate": 9.177483501306749e-08, "logits/chosen": -2.1352126598358154, "logits/rejected": -2.1297733783721924, "logps/chosen": -7.525429725646973, "logps/rejected": -4.572927474975586, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 1.042728304862976, "rewards/margins": 0.6039701700210571, "rewards/rejected": 0.43875810503959656, "step": 1555 }, { "epoch": 0.84, "learning_rate": 9.176283123124822e-08, "logits/chosen": -2.0718026161193848, "logits/rejected": -2.259840726852417, "logps/chosen": -0.39995452761650085, "logps/rejected": -0.39459002017974854, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8934361338615417, "rewards/margins": 0.019520819187164307, "rewards/rejected": 0.8739153146743774, "step": 1556 }, { "epoch": 0.84, "learning_rate": 9.175081948289243e-08, "logits/chosen": -2.296968698501587, "logits/rejected": -2.1654396057128906, "logps/chosen": -42.79576873779297, "logps/rejected": -1.4749666452407837, "loss": 0.3469, "rewards/accuracies": 1.0, "rewards/chosen": 1.807490587234497, "rewards/margins": 0.8803697824478149, "rewards/rejected": 0.9271208047866821, "step": 1557 }, { "epoch": 0.84, "learning_rate": 9.173879977029146e-08, "logits/chosen": -2.046564817428589, "logits/rejected": -2.0502829551696777, "logps/chosen": -5.421725273132324, "logps/rejected": -3.336818218231201, "loss": 0.5863, "rewards/accuracies": 1.0, "rewards/chosen": 0.9077891707420349, "rewards/margins": 0.22646570205688477, "rewards/rejected": 0.6813234686851501, "step": 1558 }, { "epoch": 0.84, "learning_rate": 9.172677209573811e-08, "logits/chosen": -2.321544885635376, "logits/rejected": -2.1834607124328613, "logps/chosen": -27.466157913208008, "logps/rejected": -1.7370394468307495, "loss": 0.3077, "rewards/accuracies": 1.0, "rewards/chosen": 1.6660746335983276, "rewards/margins": 1.020799994468689, "rewards/rejected": 0.6452746391296387, "step": 1559 }, { "epoch": 0.84, "learning_rate": 9.171473646152679e-08, "logits/chosen": -2.128415822982788, "logits/rejected": -2.1361606121063232, "logps/chosen": -2.191401720046997, "logps/rejected": -3.567631721496582, "loss": 0.4812, "rewards/accuracies": 1.0, "rewards/chosen": 1.046015977859497, "rewards/margins": 0.4812160134315491, "rewards/rejected": 0.564799964427948, "step": 1560 }, { "epoch": 0.84, "learning_rate": 9.170269286995335e-08, "logits/chosen": -2.1677324771881104, "logits/rejected": -2.311429262161255, "logps/chosen": -0.6760841012001038, "logps/rejected": -0.7627420425415039, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.924416184425354, "rewards/margins": 0.014636635780334473, "rewards/rejected": 0.9097795486450195, "step": 1561 }, { "epoch": 0.84, "learning_rate": 9.16906413233152e-08, "logits/chosen": -2.0579872131347656, "logits/rejected": -2.2038981914520264, "logps/chosen": -0.9859431982040405, "logps/rejected": -0.8896793127059937, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.7956157326698303, "rewards/margins": 0.02058541774749756, "rewards/rejected": 0.7750303149223328, "step": 1562 }, { "epoch": 0.84, "learning_rate": 9.16785818239112e-08, "logits/chosen": -1.9964807033538818, "logits/rejected": -2.1754987239837646, "logps/chosen": -1.4152307510375977, "logps/rejected": -1.360444188117981, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.8783387541770935, "rewards/margins": 0.0006172657012939453, "rewards/rejected": 0.8777214884757996, "step": 1563 }, { "epoch": 0.84, "learning_rate": 9.166651437404184e-08, "logits/chosen": -2.0214505195617676, "logits/rejected": -2.022745370864868, "logps/chosen": -3.1007351875305176, "logps/rejected": -0.8287968635559082, "loss": 0.649, "rewards/accuracies": 1.0, "rewards/chosen": 1.0474598407745361, "rewards/margins": 0.09041696786880493, "rewards/rejected": 0.9570428729057312, "step": 1564 }, { "epoch": 0.84, "learning_rate": 9.165443897600903e-08, "logits/chosen": -2.043006420135498, "logits/rejected": -2.2660491466522217, "logps/chosen": -1.1336332559585571, "logps/rejected": -1.018667221069336, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.7916561961174011, "rewards/margins": 0.011787950992584229, "rewards/rejected": 0.7798682451248169, "step": 1565 }, { "epoch": 0.84, "learning_rate": 9.164235563211627e-08, "logits/chosen": -1.918218731880188, "logits/rejected": -1.9178608655929565, "logps/chosen": -0.9443439245223999, "logps/rejected": -1.4826666116714478, "loss": 0.7059, "rewards/accuracies": 0.0, "rewards/chosen": 0.8341770172119141, "rewards/margins": -0.025255203247070312, "rewards/rejected": 0.8594322204589844, "step": 1566 }, { "epoch": 0.85, "learning_rate": 9.163026434466849e-08, "logits/chosen": -2.243300199508667, "logits/rejected": -2.191096067428589, "logps/chosen": -14.504841804504395, "logps/rejected": -9.558767318725586, "loss": 0.2712, "rewards/accuracies": 1.0, "rewards/chosen": 1.692179560661316, "rewards/margins": 1.1661574840545654, "rewards/rejected": 0.5260221362113953, "step": 1567 }, { "epoch": 0.85, "learning_rate": 9.161816511597223e-08, "logits/chosen": -2.079209089279175, "logits/rejected": -2.280865430831909, "logps/chosen": -8.487448692321777, "logps/rejected": -0.5732698440551758, "loss": 0.6671, "rewards/accuracies": 1.0, "rewards/chosen": 1.0016459226608276, "rewards/margins": 0.05284684896469116, "rewards/rejected": 0.9487990736961365, "step": 1568 }, { "epoch": 0.85, "learning_rate": 9.160605794833547e-08, "logits/chosen": -2.2040045261383057, "logits/rejected": -2.0640029907226562, "logps/chosen": -41.35224914550781, "logps/rejected": -1.5607548952102661, "loss": 0.2738, "rewards/accuracies": 1.0, "rewards/chosen": 2.124140501022339, "rewards/margins": 1.1553455591201782, "rewards/rejected": 0.9687949419021606, "step": 1569 }, { "epoch": 0.85, "learning_rate": 9.159394284406773e-08, "logits/chosen": -2.0529067516326904, "logits/rejected": -2.043246030807495, "logps/chosen": -5.223992347717285, "logps/rejected": -3.032583236694336, "loss": 0.5509, "rewards/accuracies": 1.0, "rewards/chosen": 1.0254415273666382, "rewards/margins": 0.30809444189071655, "rewards/rejected": 0.7173470854759216, "step": 1570 }, { "epoch": 0.85, "learning_rate": 9.158181980548005e-08, "logits/chosen": -1.9771745204925537, "logits/rejected": -1.9950158596038818, "logps/chosen": -1.955841064453125, "logps/rejected": -6.36826229095459, "loss": 0.3808, "rewards/accuracies": 1.0, "rewards/chosen": 1.2365541458129883, "rewards/margins": 0.7691327333450317, "rewards/rejected": 0.46742144227027893, "step": 1571 }, { "epoch": 0.85, "learning_rate": 9.156968883488498e-08, "logits/chosen": -2.268282413482666, "logits/rejected": -2.130039930343628, "logps/chosen": -36.029815673828125, "logps/rejected": -3.4585342407226562, "loss": 0.3546, "rewards/accuracies": 1.0, "rewards/chosen": 1.288369059562683, "rewards/margins": 0.8543475866317749, "rewards/rejected": 0.4340214729309082, "step": 1572 }, { "epoch": 0.85, "learning_rate": 9.155754993459662e-08, "logits/chosen": -2.1618053913116455, "logits/rejected": -2.159060478210449, "logps/chosen": -5.455486297607422, "logps/rejected": -4.554986476898193, "loss": 0.2998, "rewards/accuracies": 1.0, "rewards/chosen": 1.5034723281860352, "rewards/margins": 1.051051378250122, "rewards/rejected": 0.4524209499359131, "step": 1573 }, { "epoch": 0.85, "learning_rate": 9.154540310693047e-08, "logits/chosen": -2.1078686714172363, "logits/rejected": -2.278923988342285, "logps/chosen": -0.7108010649681091, "logps/rejected": -0.6956033706665039, "loss": 0.6747, "rewards/accuracies": 1.0, "rewards/chosen": 0.810003936290741, "rewards/margins": 0.03724116086959839, "rewards/rejected": 0.7727627754211426, "step": 1574 }, { "epoch": 0.85, "learning_rate": 9.153324835420369e-08, "logits/chosen": -2.0170891284942627, "logits/rejected": -2.2495434284210205, "logps/chosen": -0.7260084748268127, "logps/rejected": -0.75615394115448, "loss": 0.6683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9641939997673035, "rewards/margins": 0.050354957580566406, "rewards/rejected": 0.9138390421867371, "step": 1575 }, { "epoch": 0.85, "learning_rate": 9.152108567873484e-08, "logits/chosen": -2.101320743560791, "logits/rejected": -2.1006174087524414, "logps/chosen": -5.448637962341309, "logps/rejected": -5.625411033630371, "loss": 0.3701, "rewards/accuracies": 1.0, "rewards/chosen": 1.27363121509552, "rewards/margins": 0.8033068180084229, "rewards/rejected": 0.47032442688941956, "step": 1576 }, { "epoch": 0.85, "learning_rate": 9.150891508284403e-08, "logits/chosen": -2.0428760051727295, "logits/rejected": -2.0363731384277344, "logps/chosen": -16.5466251373291, "logps/rejected": -8.601693153381348, "loss": 0.3434, "rewards/accuracies": 1.0, "rewards/chosen": 1.223145842552185, "rewards/margins": 0.8921604156494141, "rewards/rejected": 0.3309854567050934, "step": 1577 }, { "epoch": 0.85, "learning_rate": 9.149673656885291e-08, "logits/chosen": -2.025449275970459, "logits/rejected": -2.02541446685791, "logps/chosen": -4.127022743225098, "logps/rejected": -5.61021614074707, "loss": 0.4677, "rewards/accuracies": 1.0, "rewards/chosen": 1.0769917964935303, "rewards/margins": 0.5171001553535461, "rewards/rejected": 0.5598916411399841, "step": 1578 }, { "epoch": 0.85, "learning_rate": 9.148455013908458e-08, "logits/chosen": -2.0032737255096436, "logits/rejected": -2.272568941116333, "logps/chosen": -0.7401665449142456, "logps/rejected": -0.8062129020690918, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.7958385348320007, "rewards/margins": 0.01733255386352539, "rewards/rejected": 0.7785059809684753, "step": 1579 }, { "epoch": 0.85, "learning_rate": 9.14723557958637e-08, "logits/chosen": -1.963661789894104, "logits/rejected": -2.2718358039855957, "logps/chosen": -2.6024677753448486, "logps/rejected": -10.439499855041504, "loss": 0.6436, "rewards/accuracies": 1.0, "rewards/chosen": 0.9097005724906921, "rewards/margins": 0.10174721479415894, "rewards/rejected": 0.8079533576965332, "step": 1580 }, { "epoch": 0.85, "learning_rate": 9.146015354151642e-08, "logits/chosen": -2.0341625213623047, "logits/rejected": -2.040132522583008, "logps/chosen": -1.6284903287887573, "logps/rejected": -4.28485107421875, "loss": 0.4774, "rewards/accuracies": 1.0, "rewards/chosen": 0.8124923706054688, "rewards/margins": 0.4913286566734314, "rewards/rejected": 0.32116371393203735, "step": 1581 }, { "epoch": 0.85, "learning_rate": 9.144794337837039e-08, "logits/chosen": -2.0867035388946533, "logits/rejected": -2.0879967212677, "logps/chosen": -0.6009854674339294, "logps/rejected": -3.484771728515625, "loss": 0.4948, "rewards/accuracies": 1.0, "rewards/chosen": 0.9565935134887695, "rewards/margins": 0.4460368752479553, "rewards/rejected": 0.5105566382408142, "step": 1582 }, { "epoch": 0.85, "learning_rate": 9.143572530875479e-08, "logits/chosen": -2.019399404525757, "logits/rejected": -2.0352590084075928, "logps/chosen": -2.210451126098633, "logps/rejected": -11.185972213745117, "loss": 0.5482, "rewards/accuracies": 1.0, "rewards/chosen": 1.1506805419921875, "rewards/margins": 0.3145914077758789, "rewards/rejected": 0.8360891342163086, "step": 1583 }, { "epoch": 0.85, "learning_rate": 9.142349933500031e-08, "logits/chosen": -2.107787609100342, "logits/rejected": -2.1781606674194336, "logps/chosen": -4.0446672439575195, "logps/rejected": -28.117422103881836, "loss": 0.4342, "rewards/accuracies": 1.0, "rewards/chosen": 1.0629488229751587, "rewards/margins": 0.6094081997871399, "rewards/rejected": 0.4535406231880188, "step": 1584 }, { "epoch": 0.85, "learning_rate": 9.141126545943911e-08, "logits/chosen": -2.009748697280884, "logits/rejected": -2.005795955657959, "logps/chosen": -3.7457327842712402, "logps/rejected": -4.615904808044434, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 1.443442940711975, "rewards/margins": 1.0248056650161743, "rewards/rejected": 0.4186372756958008, "step": 1585 }, { "epoch": 0.86, "learning_rate": 9.13990236844049e-08, "logits/chosen": -2.112630844116211, "logits/rejected": -2.2930376529693604, "logps/chosen": -1.2245503664016724, "logps/rejected": -1.2531425952911377, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9335057139396667, "rewards/margins": 0.017245054244995117, "rewards/rejected": 0.9162606596946716, "step": 1586 }, { "epoch": 0.86, "learning_rate": 9.138677401223287e-08, "logits/chosen": -2.0531840324401855, "logits/rejected": -2.2703676223754883, "logps/chosen": -1.0553364753723145, "logps/rejected": -1.0763989686965942, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.8647815585136414, "rewards/margins": 0.0077190399169921875, "rewards/rejected": 0.8570625185966492, "step": 1587 }, { "epoch": 0.86, "learning_rate": 9.137451644525973e-08, "logits/chosen": -2.079516649246216, "logits/rejected": -2.085236072540283, "logps/chosen": -4.153635025024414, "logps/rejected": -4.794942855834961, "loss": 0.4433, "rewards/accuracies": 1.0, "rewards/chosen": 1.2622461318969727, "rewards/margins": 0.5837326049804688, "rewards/rejected": 0.6785135269165039, "step": 1588 }, { "epoch": 0.86, "learning_rate": 9.136225098582369e-08, "logits/chosen": -2.0933051109313965, "logits/rejected": -2.2453417778015137, "logps/chosen": -1.6399548053741455, "logps/rejected": -1.648904800415039, "loss": 0.6702, "rewards/accuracies": 1.0, "rewards/chosen": 0.9776332974433899, "rewards/margins": 0.046344876289367676, "rewards/rejected": 0.9312884211540222, "step": 1589 }, { "epoch": 0.86, "learning_rate": 9.134997763626447e-08, "logits/chosen": -2.033822774887085, "logits/rejected": -2.225480079650879, "logps/chosen": -1.39702308177948, "logps/rejected": -2.44901704788208, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 1.0037397146224976, "rewards/margins": -0.005717873573303223, "rewards/rejected": 1.0094575881958008, "step": 1590 }, { "epoch": 0.86, "learning_rate": 9.13376963989233e-08, "logits/chosen": -2.1576647758483887, "logits/rejected": -2.1617770195007324, "logps/chosen": -3.3892714977264404, "logps/rejected": -5.641064167022705, "loss": 0.3354, "rewards/accuracies": 1.0, "rewards/chosen": 1.5179091691970825, "rewards/margins": 0.9201479554176331, "rewards/rejected": 0.5977612137794495, "step": 1591 }, { "epoch": 0.86, "learning_rate": 9.132540727614289e-08, "logits/chosen": -2.0031745433807373, "logits/rejected": -2.3097448348999023, "logps/chosen": -9.093544006347656, "logps/rejected": -8.5230131149292, "loss": 0.7027, "rewards/accuracies": 0.0, "rewards/chosen": 0.4741500914096832, "rewards/margins": -0.019025713205337524, "rewards/rejected": 0.49317580461502075, "step": 1592 }, { "epoch": 0.86, "learning_rate": 9.13131102702675e-08, "logits/chosen": -2.1956024169921875, "logits/rejected": -2.134993314743042, "logps/chosen": -33.70378112792969, "logps/rejected": -5.117608547210693, "loss": 0.365, "rewards/accuracies": 1.0, "rewards/chosen": 1.479684829711914, "rewards/margins": 0.8196672797203064, "rewards/rejected": 0.6600175499916077, "step": 1593 }, { "epoch": 0.86, "learning_rate": 9.130080538364282e-08, "logits/chosen": -2.1142807006835938, "logits/rejected": -2.272130012512207, "logps/chosen": -3.6043262481689453, "logps/rejected": -3.4913206100463867, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9088878035545349, "rewards/margins": 0.017048001289367676, "rewards/rejected": 0.8918398022651672, "step": 1594 }, { "epoch": 0.86, "learning_rate": 9.128849261861613e-08, "logits/chosen": -2.1942405700683594, "logits/rejected": -2.1880478858947754, "logps/chosen": -2.161329507827759, "logps/rejected": -10.598796844482422, "loss": 0.3706, "rewards/accuracies": 1.0, "rewards/chosen": 1.3151942491531372, "rewards/margins": 0.8014874458312988, "rewards/rejected": 0.5137068033218384, "step": 1595 }, { "epoch": 0.86, "learning_rate": 9.127617197753615e-08, "logits/chosen": -2.1130926609039307, "logits/rejected": -2.310453414916992, "logps/chosen": -2.138070583343506, "logps/rejected": -2.2009239196777344, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9442384839057922, "rewards/margins": 0.02155524492263794, "rewards/rejected": 0.9226832389831543, "step": 1596 }, { "epoch": 0.86, "learning_rate": 9.126384346275313e-08, "logits/chosen": -2.0192620754241943, "logits/rejected": -2.0100369453430176, "logps/chosen": -6.670244216918945, "logps/rejected": -5.432431221008301, "loss": 0.4631, "rewards/accuracies": 1.0, "rewards/chosen": 1.0777047872543335, "rewards/margins": 0.5293570160865784, "rewards/rejected": 0.5483477711677551, "step": 1597 }, { "epoch": 0.86, "learning_rate": 9.125150707661882e-08, "logits/chosen": -2.1070868968963623, "logits/rejected": -2.1128180027008057, "logps/chosen": -2.0374279022216797, "logps/rejected": -1.7119040489196777, "loss": 0.4691, "rewards/accuracies": 1.0, "rewards/chosen": 1.1358413696289062, "rewards/margins": 0.5132039189338684, "rewards/rejected": 0.6226374506950378, "step": 1598 }, { "epoch": 0.86, "learning_rate": 9.123916282148647e-08, "logits/chosen": -2.000260591506958, "logits/rejected": -1.9913899898529053, "logps/chosen": -6.759612083435059, "logps/rejected": -4.664909362792969, "loss": 0.55, "rewards/accuracies": 1.0, "rewards/chosen": 1.0913265943527222, "rewards/margins": 0.3102537989616394, "rewards/rejected": 0.7810727953910828, "step": 1599 }, { "epoch": 0.86, "learning_rate": 9.12268106997108e-08, "logits/chosen": -2.1680550575256348, "logits/rejected": -2.061216115951538, "logps/chosen": -45.3266716003418, "logps/rejected": -2.3164658546447754, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": 1.6507328748703003, "rewards/margins": 0.9144011735916138, "rewards/rejected": 0.7363317012786865, "step": 1600 }, { "epoch": 0.86, "learning_rate": 9.121445071364809e-08, "logits/chosen": -2.0381648540496826, "logits/rejected": -2.27301287651062, "logps/chosen": -7.789549827575684, "logps/rejected": -5.197845458984375, "loss": 0.7472, "rewards/accuracies": 0.0, "rewards/chosen": 0.9409169554710388, "rewards/margins": -0.1053127646446228, "rewards/rejected": 1.0462297201156616, "step": 1601 }, { "epoch": 0.86, "learning_rate": 9.12020828656561e-08, "logits/chosen": -2.091097116470337, "logits/rejected": -2.2729833126068115, "logps/chosen": -4.275428771972656, "logps/rejected": -4.1648478507995605, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.9265165328979492, "rewards/margins": 0.0019285082817077637, "rewards/rejected": 0.9245880246162415, "step": 1602 }, { "epoch": 0.86, "learning_rate": 9.118970715809402e-08, "logits/chosen": -2.002809524536133, "logits/rejected": -2.273796796798706, "logps/chosen": -3.009359836578369, "logps/rejected": -5.762357234954834, "loss": 0.7164, "rewards/accuracies": 0.0, "rewards/chosen": 0.8375757336616516, "rewards/margins": -0.046071648597717285, "rewards/rejected": 0.8836473822593689, "step": 1603 }, { "epoch": 0.87, "learning_rate": 9.117732359332267e-08, "logits/chosen": -2.067711353302002, "logits/rejected": -2.34220290184021, "logps/chosen": -21.807113647460938, "logps/rejected": -19.912900924682617, "loss": 0.5856, "rewards/accuracies": 1.0, "rewards/chosen": 0.04284477233886719, "rewards/margins": 0.22812901437282562, "rewards/rejected": -0.18528424203395844, "step": 1604 }, { "epoch": 0.87, "learning_rate": 9.116493217370425e-08, "logits/chosen": -2.0917599201202393, "logits/rejected": -2.285762310028076, "logps/chosen": -5.245703220367432, "logps/rejected": -3.938396692276001, "loss": 0.5999, "rewards/accuracies": 1.0, "rewards/chosen": 1.050492286682129, "rewards/margins": 0.19609689712524414, "rewards/rejected": 0.8543953895568848, "step": 1605 }, { "epoch": 0.87, "learning_rate": 9.115253290160252e-08, "logits/chosen": -2.0821011066436768, "logits/rejected": -2.077744722366333, "logps/chosen": -9.483257293701172, "logps/rejected": -3.5856759548187256, "loss": 0.3425, "rewards/accuracies": 1.0, "rewards/chosen": 1.454768419265747, "rewards/margins": 0.8953893780708313, "rewards/rejected": 0.5593790411949158, "step": 1606 }, { "epoch": 0.87, "learning_rate": 9.114012577938273e-08, "logits/chosen": -1.9846880435943604, "logits/rejected": -2.2475290298461914, "logps/chosen": -1.5795860290527344, "logps/rejected": -1.535370111465454, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.8472078442573547, "rewards/margins": 0.002811729907989502, "rewards/rejected": 0.8443961143493652, "step": 1607 }, { "epoch": 0.87, "learning_rate": 9.11277108094116e-08, "logits/chosen": -1.9912123680114746, "logits/rejected": -2.231926679611206, "logps/chosen": -1.6394104957580566, "logps/rejected": -1.681083083152771, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.9386526346206665, "rewards/margins": -0.001766502857208252, "rewards/rejected": 0.9404191374778748, "step": 1608 }, { "epoch": 0.87, "learning_rate": 9.11152879940574e-08, "logits/chosen": -2.1256139278411865, "logits/rejected": -2.264944553375244, "logps/chosen": -3.641603708267212, "logps/rejected": -3.701010227203369, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.853376030921936, "rewards/margins": 0.005043983459472656, "rewards/rejected": 0.8483320474624634, "step": 1609 }, { "epoch": 0.87, "learning_rate": 9.110285733568981e-08, "logits/chosen": -2.0370023250579834, "logits/rejected": -2.2631375789642334, "logps/chosen": -0.6655452251434326, "logps/rejected": -0.727762758731842, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9649237990379333, "rewards/margins": 0.017056941986083984, "rewards/rejected": 0.9478668570518494, "step": 1610 }, { "epoch": 0.87, "learning_rate": 9.109041883668012e-08, "logits/chosen": -2.1219210624694824, "logits/rejected": -2.3057305812835693, "logps/chosen": -4.075150489807129, "logps/rejected": -4.836980819702148, "loss": 0.7975, "rewards/accuracies": 0.0, "rewards/chosen": 0.9794479608535767, "rewards/margins": -0.1988847255706787, "rewards/rejected": 1.1783326864242554, "step": 1611 }, { "epoch": 0.87, "learning_rate": 9.107797249940104e-08, "logits/chosen": -2.0482017993927, "logits/rejected": -2.325179100036621, "logps/chosen": -0.6108818650245667, "logps/rejected": -0.6451194286346436, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.7645879983901978, "rewards/margins": 0.01449960470199585, "rewards/rejected": 0.7500883936882019, "step": 1612 }, { "epoch": 0.87, "learning_rate": 9.106551832622677e-08, "logits/chosen": -2.1404497623443604, "logits/rejected": -2.3135063648223877, "logps/chosen": -0.7782426476478577, "logps/rejected": -0.7883164882659912, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 1.0249049663543701, "rewards/margins": 0.0042417049407958984, "rewards/rejected": 1.0206632614135742, "step": 1613 }, { "epoch": 0.87, "learning_rate": 9.105305631953303e-08, "logits/chosen": -2.036064863204956, "logits/rejected": -2.252122402191162, "logps/chosen": -1.7044342756271362, "logps/rejected": -1.7317211627960205, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.7869811058044434, "rewards/margins": -0.00797569751739502, "rewards/rejected": 0.7949568033218384, "step": 1614 }, { "epoch": 0.87, "learning_rate": 9.104058648169706e-08, "logits/chosen": -1.932940125465393, "logits/rejected": -2.215564727783203, "logps/chosen": -0.5778757333755493, "logps/rejected": -0.5500752925872803, "loss": 0.6755, "rewards/accuracies": 1.0, "rewards/chosen": 0.8102983832359314, "rewards/margins": 0.035599470138549805, "rewards/rejected": 0.7746989130973816, "step": 1615 }, { "epoch": 0.87, "learning_rate": 9.102810881509752e-08, "logits/chosen": -2.191232204437256, "logits/rejected": -2.0241715908050537, "logps/chosen": -43.55302429199219, "logps/rejected": -2.6662702560424805, "loss": 0.2124, "rewards/accuracies": 1.0, "rewards/chosen": 2.0149524211883545, "rewards/margins": 1.441112995147705, "rewards/rejected": 0.5738393664360046, "step": 1616 }, { "epoch": 0.87, "learning_rate": 9.101562332211466e-08, "logits/chosen": -2.0823917388916016, "logits/rejected": -2.0848944187164307, "logps/chosen": -2.0091896057128906, "logps/rejected": -1.5340750217437744, "loss": 0.6548, "rewards/accuracies": 1.0, "rewards/chosen": 1.0781828165054321, "rewards/margins": 0.07825988531112671, "rewards/rejected": 0.9999229311943054, "step": 1617 }, { "epoch": 0.87, "learning_rate": 9.100313000513011e-08, "logits/chosen": -2.1377530097961426, "logits/rejected": -2.132387161254883, "logps/chosen": -3.0070698261260986, "logps/rejected": -2.8375236988067627, "loss": 0.5496, "rewards/accuracies": 1.0, "rewards/chosen": 1.0865144729614258, "rewards/margins": 0.31128233671188354, "rewards/rejected": 0.7752321362495422, "step": 1618 }, { "epoch": 0.87, "learning_rate": 9.09906288665271e-08, "logits/chosen": -2.006004810333252, "logits/rejected": -2.009995222091675, "logps/chosen": -1.9105026721954346, "logps/rejected": -4.772818565368652, "loss": 0.4109, "rewards/accuracies": 1.0, "rewards/chosen": 1.077026128768921, "rewards/margins": 0.6768338680267334, "rewards/rejected": 0.4001922607421875, "step": 1619 }, { "epoch": 0.87, "learning_rate": 9.097811990869029e-08, "logits/chosen": -2.0635101795196533, "logits/rejected": -2.2756686210632324, "logps/chosen": -0.5192890167236328, "logps/rejected": -0.5620659589767456, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.9337369799613953, "rewards/margins": 0.014641046524047852, "rewards/rejected": 0.9190959334373474, "step": 1620 }, { "epoch": 0.87, "learning_rate": 9.096560313400582e-08, "logits/chosen": -2.1358602046966553, "logits/rejected": -2.058032989501953, "logps/chosen": -40.16905975341797, "logps/rejected": -12.039374351501465, "loss": 0.2979, "rewards/accuracies": 1.0, "rewards/chosen": 1.724564790725708, "rewards/margins": 1.0582396984100342, "rewards/rejected": 0.6663250923156738, "step": 1621 }, { "epoch": 0.87, "learning_rate": 9.095307854486141e-08, "logits/chosen": -2.0437355041503906, "logits/rejected": -2.2812113761901855, "logps/chosen": -0.39013704657554626, "logps/rejected": -0.4833870232105255, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.8445653319358826, "rewards/margins": 0.018577218055725098, "rewards/rejected": 0.8259881138801575, "step": 1622 }, { "epoch": 0.88, "learning_rate": 9.094054614364616e-08, "logits/chosen": -1.9853744506835938, "logits/rejected": -1.9887845516204834, "logps/chosen": -4.563961982727051, "logps/rejected": -1.397141933441162, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 1.2125791311264038, "rewards/margins": 0.0014394521713256836, "rewards/rejected": 1.2111396789550781, "step": 1623 }, { "epoch": 0.88, "learning_rate": 9.092800593275072e-08, "logits/chosen": -2.1484885215759277, "logits/rejected": -2.3088507652282715, "logps/chosen": -6.006525039672852, "logps/rejected": -3.642843723297119, "loss": 0.6615, "rewards/accuracies": 1.0, "rewards/chosen": 0.6335176825523376, "rewards/margins": 0.06431072950363159, "rewards/rejected": 0.569206953048706, "step": 1624 }, { "epoch": 0.88, "learning_rate": 9.091545791456721e-08, "logits/chosen": -2.008556365966797, "logits/rejected": -2.007704019546509, "logps/chosen": -2.6094610691070557, "logps/rejected": -5.115635395050049, "loss": 0.3595, "rewards/accuracies": 1.0, "rewards/chosen": 1.445185661315918, "rewards/margins": 0.8379665613174438, "rewards/rejected": 0.6072190999984741, "step": 1625 }, { "epoch": 0.88, "learning_rate": 9.090290209148925e-08, "logits/chosen": -2.1498093605041504, "logits/rejected": -2.2770254611968994, "logps/chosen": -5.4896016120910645, "logps/rejected": -1.5654644966125488, "loss": 0.7349, "rewards/accuracies": 0.0, "rewards/chosen": 0.9349819421768188, "rewards/margins": -0.08181190490722656, "rewards/rejected": 1.0167938470840454, "step": 1626 }, { "epoch": 0.88, "learning_rate": 9.089033846591197e-08, "logits/chosen": -2.031951427459717, "logits/rejected": -2.239482879638672, "logps/chosen": -0.3410881459712982, "logps/rejected": -0.34140676259994507, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.9356727600097656, "rewards/margins": 0.008866965770721436, "rewards/rejected": 0.9268057942390442, "step": 1627 }, { "epoch": 0.88, "learning_rate": 9.087776704023192e-08, "logits/chosen": -2.1457483768463135, "logits/rejected": -2.2870993614196777, "logps/chosen": -0.9984224438667297, "logps/rejected": -3.365426540374756, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 1.0059548616409302, "rewards/margins": 0.03485196828842163, "rewards/rejected": 0.9711028933525085, "step": 1628 }, { "epoch": 0.88, "learning_rate": 9.086518781684724e-08, "logits/chosen": -2.071712017059326, "logits/rejected": -2.2595183849334717, "logps/chosen": -0.4459441900253296, "logps/rejected": -0.3737286329269409, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.746658980846405, "rewards/margins": 0.023036479949951172, "rewards/rejected": 0.7236225008964539, "step": 1629 }, { "epoch": 0.88, "learning_rate": 9.085260079815744e-08, "logits/chosen": -2.140458583831787, "logits/rejected": -2.3230907917022705, "logps/chosen": -2.925447940826416, "logps/rejected": -2.785853385925293, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.8111026883125305, "rewards/margins": 0.010357975959777832, "rewards/rejected": 0.8007447123527527, "step": 1630 }, { "epoch": 0.88, "learning_rate": 9.084000598656363e-08, "logits/chosen": -2.0692243576049805, "logits/rejected": -2.2389471530914307, "logps/chosen": -3.2397279739379883, "logps/rejected": -3.1982550621032715, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 0.6408122181892395, "rewards/margins": 0.03703826665878296, "rewards/rejected": 0.6037739515304565, "step": 1631 }, { "epoch": 0.88, "learning_rate": 9.082740338446832e-08, "logits/chosen": -2.104449987411499, "logits/rejected": -2.2141458988189697, "logps/chosen": -1.302573561668396, "logps/rejected": -1.3256375789642334, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.8881421089172363, "rewards/margins": 0.029459476470947266, "rewards/rejected": 0.8586826324462891, "step": 1632 }, { "epoch": 0.88, "learning_rate": 9.081479299427555e-08, "logits/chosen": -2.081418752670288, "logits/rejected": -2.0908708572387695, "logps/chosen": -19.973011016845703, "logps/rejected": -6.310494422912598, "loss": 0.268, "rewards/accuracies": 1.0, "rewards/chosen": 1.905290961265564, "rewards/margins": 1.1797878742218018, "rewards/rejected": 0.7255030870437622, "step": 1633 }, { "epoch": 0.88, "learning_rate": 9.080217481839084e-08, "logits/chosen": -2.0294129848480225, "logits/rejected": -2.0229482650756836, "logps/chosen": -4.080356121063232, "logps/rejected": -4.509180068969727, "loss": 0.3951, "rewards/accuracies": 1.0, "rewards/chosen": 1.312463641166687, "rewards/margins": 0.7244646549224854, "rewards/rejected": 0.5879989862442017, "step": 1634 }, { "epoch": 0.88, "learning_rate": 9.078954885922119e-08, "logits/chosen": -2.0445003509521484, "logits/rejected": -2.293522596359253, "logps/chosen": -0.7496806383132935, "logps/rejected": -0.6458419561386108, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.9358939528465271, "rewards/margins": 0.03953295946121216, "rewards/rejected": 0.8963609933853149, "step": 1635 }, { "epoch": 0.88, "learning_rate": 9.077691511917506e-08, "logits/chosen": -2.1455366611480713, "logits/rejected": -2.261086940765381, "logps/chosen": -5.107229709625244, "logps/rejected": -0.6754746437072754, "loss": 0.7229, "rewards/accuracies": 0.0, "rewards/chosen": 0.8997978568077087, "rewards/margins": -0.05871635675430298, "rewards/rejected": 0.9585142135620117, "step": 1636 }, { "epoch": 0.88, "learning_rate": 9.076427360066246e-08, "logits/chosen": -2.000836133956909, "logits/rejected": -2.0066165924072266, "logps/chosen": -4.340358257293701, "logps/rejected": -5.105788230895996, "loss": 0.4823, "rewards/accuracies": 1.0, "rewards/chosen": 0.8811451196670532, "rewards/margins": 0.4782565236091614, "rewards/rejected": 0.40288859605789185, "step": 1637 }, { "epoch": 0.88, "learning_rate": 9.075162430609481e-08, "logits/chosen": -1.9319323301315308, "logits/rejected": -1.9387298822402954, "logps/chosen": -2.0342540740966797, "logps/rejected": -3.407496213912964, "loss": 0.5097, "rewards/accuracies": 1.0, "rewards/chosen": 0.8770595788955688, "rewards/margins": 0.40821582078933716, "rewards/rejected": 0.4688437581062317, "step": 1638 }, { "epoch": 0.88, "learning_rate": 9.073896723788508e-08, "logits/chosen": -2.0829966068267822, "logits/rejected": -2.086944341659546, "logps/chosen": -3.8968582153320312, "logps/rejected": -3.6301612854003906, "loss": 0.538, "rewards/accuracies": 1.0, "rewards/chosen": 1.1034457683563232, "rewards/margins": 0.3388392925262451, "rewards/rejected": 0.7646064758300781, "step": 1639 }, { "epoch": 0.88, "learning_rate": 9.072630239844765e-08, "logits/chosen": -2.114701986312866, "logits/rejected": -2.1118204593658447, "logps/chosen": -6.361594200134277, "logps/rejected": -3.1523780822753906, "loss": 0.4036, "rewards/accuracies": 1.0, "rewards/chosen": 1.3677719831466675, "rewards/margins": 0.6987505555152893, "rewards/rejected": 0.6690214276313782, "step": 1640 }, { "epoch": 0.89, "learning_rate": 9.071362979019846e-08, "logits/chosen": -2.0757718086242676, "logits/rejected": -2.2693405151367188, "logps/chosen": -10.197190284729004, "logps/rejected": -10.732946395874023, "loss": 0.7248, "rewards/accuracies": 0.0, "rewards/chosen": 0.8607361912727356, "rewards/margins": -0.06224071979522705, "rewards/rejected": 0.9229769110679626, "step": 1641 }, { "epoch": 0.89, "learning_rate": 9.070094941555487e-08, "logits/chosen": -2.035285234451294, "logits/rejected": -2.029599189758301, "logps/chosen": -6.3435187339782715, "logps/rejected": -6.219779014587402, "loss": 0.3947, "rewards/accuracies": 1.0, "rewards/chosen": 1.3424917459487915, "rewards/margins": 0.7257370948791504, "rewards/rejected": 0.6167546510696411, "step": 1642 }, { "epoch": 0.89, "learning_rate": 9.068826127693575e-08, "logits/chosen": -2.0018558502197266, "logits/rejected": -2.2351412773132324, "logps/chosen": -0.47107023000717163, "logps/rejected": -0.5536491870880127, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": 0.8611486554145813, "rewards/margins": -0.012539207935333252, "rewards/rejected": 0.8736878633499146, "step": 1643 }, { "epoch": 0.89, "learning_rate": 9.067556537676144e-08, "logits/chosen": -2.050565242767334, "logits/rejected": -2.2809364795684814, "logps/chosen": -0.6368589401245117, "logps/rejected": -0.6562312841415405, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 1.013153076171875, "rewards/margins": 0.0037442445755004883, "rewards/rejected": 1.0094088315963745, "step": 1644 }, { "epoch": 0.89, "learning_rate": 9.066286171745379e-08, "logits/chosen": -2.1252803802490234, "logits/rejected": -2.1961569786071777, "logps/chosen": -0.7216636538505554, "logps/rejected": -0.7320334911346436, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.9461261630058289, "rewards/margins": 0.008926153182983398, "rewards/rejected": 0.9372000098228455, "step": 1645 }, { "epoch": 0.89, "learning_rate": 9.065015030143607e-08, "logits/chosen": -2.13816499710083, "logits/rejected": -2.260953664779663, "logps/chosen": -1.460267424583435, "logps/rejected": -1.4972761869430542, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.6560196280479431, "rewards/margins": 0.005698800086975098, "rewards/rejected": 0.650320827960968, "step": 1646 }, { "epoch": 0.89, "learning_rate": 9.063743113113311e-08, "logits/chosen": -2.1944479942321777, "logits/rejected": -2.1968846321105957, "logps/chosen": -0.8079571723937988, "logps/rejected": -3.562753677368164, "loss": 0.5232, "rewards/accuracies": 1.0, "rewards/chosen": 1.0247976779937744, "rewards/margins": 0.3747260570526123, "rewards/rejected": 0.6500716209411621, "step": 1647 }, { "epoch": 0.89, "learning_rate": 9.062470420897115e-08, "logits/chosen": -2.1403017044067383, "logits/rejected": -2.1349735260009766, "logps/chosen": -4.818498611450195, "logps/rejected": -3.796525478363037, "loss": 0.6679, "rewards/accuracies": 1.0, "rewards/chosen": 0.7455722689628601, "rewards/margins": 0.05111658573150635, "rewards/rejected": 0.6944556832313538, "step": 1648 }, { "epoch": 0.89, "learning_rate": 9.061196953737794e-08, "logits/chosen": -1.9521656036376953, "logits/rejected": -1.9587723016738892, "logps/chosen": -3.241004705429077, "logps/rejected": -4.285340785980225, "loss": 0.4613, "rewards/accuracies": 1.0, "rewards/chosen": 1.1687393188476562, "rewards/margins": 0.5340844392776489, "rewards/rejected": 0.6346548795700073, "step": 1649 }, { "epoch": 0.89, "learning_rate": 9.059922711878269e-08, "logits/chosen": -2.0833492279052734, "logits/rejected": -2.089430093765259, "logps/chosen": -2.218153715133667, "logps/rejected": -3.5362260341644287, "loss": 0.4841, "rewards/accuracies": 1.0, "rewards/chosen": 1.1169966459274292, "rewards/margins": 0.4736056923866272, "rewards/rejected": 0.643390953540802, "step": 1650 }, { "epoch": 0.89, "learning_rate": 9.058647695561613e-08, "logits/chosen": -2.0493783950805664, "logits/rejected": -2.226322650909424, "logps/chosen": -4.4662580490112305, "logps/rejected": -3.551487684249878, "loss": 0.752, "rewards/accuracies": 0.0, "rewards/chosen": 0.7973334193229675, "rewards/margins": -0.11448192596435547, "rewards/rejected": 0.911815345287323, "step": 1651 }, { "epoch": 0.89, "learning_rate": 9.057371905031041e-08, "logits/chosen": -2.1551172733306885, "logits/rejected": -2.1569595336914062, "logps/chosen": -2.2403223514556885, "logps/rejected": -4.052532196044922, "loss": 0.4978, "rewards/accuracies": 1.0, "rewards/chosen": 1.1193723678588867, "rewards/margins": 0.4384160041809082, "rewards/rejected": 0.6809563636779785, "step": 1652 }, { "epoch": 0.89, "learning_rate": 9.05609534052992e-08, "logits/chosen": -1.923886775970459, "logits/rejected": -2.23901629447937, "logps/chosen": -0.41635942459106445, "logps/rejected": -0.43597057461738586, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.96344393491745, "rewards/margins": 0.008991479873657227, "rewards/rejected": 0.9544524550437927, "step": 1653 }, { "epoch": 0.89, "learning_rate": 9.054818002301763e-08, "logits/chosen": -2.0880837440490723, "logits/rejected": -2.0871493816375732, "logps/chosen": -2.2057132720947266, "logps/rejected": -6.376780033111572, "loss": 0.4869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9020921587944031, "rewards/margins": 0.4664163887500763, "rewards/rejected": 0.4356757700443268, "step": 1654 }, { "epoch": 0.89, "learning_rate": 9.05353989059023e-08, "logits/chosen": -1.9567135572433472, "logits/rejected": -2.2140982151031494, "logps/chosen": -1.7075517177581787, "logps/rejected": -3.8738667964935303, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 1.000336766242981, "rewards/margins": 0.01938486099243164, "rewards/rejected": 0.9809519052505493, "step": 1655 }, { "epoch": 0.89, "learning_rate": 9.052261005639131e-08, "logits/chosen": -2.06190824508667, "logits/rejected": -2.062495708465576, "logps/chosen": -1.3116792440414429, "logps/rejected": -1.8409693241119385, "loss": 0.662, "rewards/accuracies": 1.0, "rewards/chosen": 0.7457146048545837, "rewards/margins": 0.06333011388778687, "rewards/rejected": 0.6823844909667969, "step": 1656 }, { "epoch": 0.89, "learning_rate": 9.05098134769242e-08, "logits/chosen": -2.0679922103881836, "logits/rejected": -2.3045244216918945, "logps/chosen": -1.009586215019226, "logps/rejected": -0.9575487375259399, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.8139687776565552, "rewards/margins": 0.010094940662384033, "rewards/rejected": 0.8038738369941711, "step": 1657 }, { "epoch": 0.89, "learning_rate": 9.049700916994198e-08, "logits/chosen": -2.135845184326172, "logits/rejected": -2.1076743602752686, "logps/chosen": -30.029699325561523, "logps/rejected": -4.02484130859375, "loss": 0.7811, "rewards/accuracies": 0.0, "rewards/chosen": 1.0105692148208618, "rewards/margins": -0.16883623600006104, "rewards/rejected": 1.1794054508209229, "step": 1658 }, { "epoch": 0.89, "learning_rate": 9.048419713788721e-08, "logits/chosen": -2.0213794708251953, "logits/rejected": -2.0880720615386963, "logps/chosen": -4.583895206451416, "logps/rejected": -23.5810604095459, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": 1.4770910739898682, "rewards/margins": 1.3456379175186157, "rewards/rejected": 0.13145314157009125, "step": 1659 }, { "epoch": 0.9, "learning_rate": 9.047137738320384e-08, "logits/chosen": -1.9517747163772583, "logits/rejected": -1.9507644176483154, "logps/chosen": -1.6928123235702515, "logps/rejected": -2.0251917839050293, "loss": 0.6191, "rewards/accuracies": 1.0, "rewards/chosen": 1.0803625583648682, "rewards/margins": 0.1539684534072876, "rewards/rejected": 0.9263941049575806, "step": 1660 }, { "epoch": 0.9, "learning_rate": 9.045854990833732e-08, "logits/chosen": -2.1450936794281006, "logits/rejected": -2.13633131980896, "logps/chosen": -8.35951042175293, "logps/rejected": -0.7363818883895874, "loss": 0.4954, "rewards/accuracies": 1.0, "rewards/chosen": 1.4147214889526367, "rewards/margins": 0.44444072246551514, "rewards/rejected": 0.9702807664871216, "step": 1661 }, { "epoch": 0.9, "learning_rate": 9.044571471573459e-08, "logits/chosen": -2.022066354751587, "logits/rejected": -2.2399747371673584, "logps/chosen": -1.1232045888900757, "logps/rejected": -1.1302616596221924, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": 0.8967970013618469, "rewards/margins": 0.04152095317840576, "rewards/rejected": 0.8552760481834412, "step": 1662 }, { "epoch": 0.9, "learning_rate": 9.043287180784405e-08, "logits/chosen": -2.1137030124664307, "logits/rejected": -2.243077516555786, "logps/chosen": -1.380097508430481, "logps/rejected": -1.439497470855713, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 0.8071188926696777, "rewards/margins": -0.0014719963073730469, "rewards/rejected": 0.8085908889770508, "step": 1663 }, { "epoch": 0.9, "learning_rate": 9.042002118711554e-08, "logits/chosen": -2.161536931991577, "logits/rejected": -2.1655797958374023, "logps/chosen": -0.7925145626068115, "logps/rejected": -4.8609747886657715, "loss": 0.4481, "rewards/accuracies": 1.0, "rewards/chosen": 1.0031622648239136, "rewards/margins": 0.5702483654022217, "rewards/rejected": 0.4329139292240143, "step": 1664 }, { "epoch": 0.9, "learning_rate": 9.040716285600043e-08, "logits/chosen": -2.0274412631988525, "logits/rejected": -2.0334529876708984, "logps/chosen": -1.8650078773498535, "logps/rejected": -2.505708694458008, "loss": 0.5204, "rewards/accuracies": 1.0, "rewards/chosen": 0.9945473074913025, "rewards/margins": 0.3817828297615051, "rewards/rejected": 0.6127644777297974, "step": 1665 }, { "epoch": 0.9, "learning_rate": 9.039429681695152e-08, "logits/chosen": -1.969037652015686, "logits/rejected": -2.214554786682129, "logps/chosen": -0.7065460085868835, "logps/rejected": -0.6929841041564941, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.9359650015830994, "rewards/margins": -0.0019176006317138672, "rewards/rejected": 0.9378826022148132, "step": 1666 }, { "epoch": 0.9, "learning_rate": 9.03814230724231e-08, "logits/chosen": -2.05277943611145, "logits/rejected": -2.255298137664795, "logps/chosen": -0.8912097215652466, "logps/rejected": -0.9172278642654419, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.7267983555793762, "rewards/margins": 0.027605175971984863, "rewards/rejected": 0.6991931796073914, "step": 1667 }, { "epoch": 0.9, "learning_rate": 9.036854162487089e-08, "logits/chosen": -2.020836591720581, "logits/rejected": -2.229515790939331, "logps/chosen": -2.7208192348480225, "logps/rejected": -2.759436845779419, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.7859007716178894, "rewards/margins": 0.009432196617126465, "rewards/rejected": 0.7764685750007629, "step": 1668 }, { "epoch": 0.9, "learning_rate": 9.035565247675216e-08, "logits/chosen": -2.019146203994751, "logits/rejected": -2.301179885864258, "logps/chosen": -6.9165940284729, "logps/rejected": -7.913525104522705, "loss": 0.6293, "rewards/accuracies": 1.0, "rewards/chosen": 0.47592511773109436, "rewards/margins": 0.13214188814163208, "rewards/rejected": 0.3437832295894623, "step": 1669 }, { "epoch": 0.9, "learning_rate": 9.03427556305256e-08, "logits/chosen": -2.052694082260132, "logits/rejected": -2.304490327835083, "logps/chosen": -0.6443431377410889, "logps/rejected": -5.5697407722473145, "loss": 0.5511, "rewards/accuracies": 1.0, "rewards/chosen": 0.9762789011001587, "rewards/margins": 0.3077143430709839, "rewards/rejected": 0.6685645580291748, "step": 1670 }, { "epoch": 0.9, "learning_rate": 9.032985108865132e-08, "logits/chosen": -2.0597641468048096, "logits/rejected": -2.0639281272888184, "logps/chosen": -2.8879647254943848, "logps/rejected": -4.731557846069336, "loss": 0.4844, "rewards/accuracies": 1.0, "rewards/chosen": 1.0022436380386353, "rewards/margins": 0.47279852628707886, "rewards/rejected": 0.5294451117515564, "step": 1671 }, { "epoch": 0.9, "learning_rate": 9.031693885359097e-08, "logits/chosen": -2.02177095413208, "logits/rejected": -2.0194203853607178, "logps/chosen": -1.4026000499725342, "logps/rejected": -4.232370376586914, "loss": 0.4802, "rewards/accuracies": 1.0, "rewards/chosen": 0.8988249897956848, "rewards/margins": 0.4839131534099579, "rewards/rejected": 0.41491183638572693, "step": 1672 }, { "epoch": 0.9, "learning_rate": 9.030401892780766e-08, "logits/chosen": -2.0509631633758545, "logits/rejected": -2.264427900314331, "logps/chosen": -2.2344541549682617, "logps/rejected": -2.2392332553863525, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.6179937720298767, "rewards/margins": 0.02779179811477661, "rewards/rejected": 0.5902019739151001, "step": 1673 }, { "epoch": 0.9, "learning_rate": 9.029109131376594e-08, "logits/chosen": -1.9982666969299316, "logits/rejected": -2.258089780807495, "logps/chosen": -0.39278674125671387, "logps/rejected": -0.3958316445350647, "loss": 0.6745, "rewards/accuracies": 1.0, "rewards/chosen": 0.9741033911705017, "rewards/margins": 0.03760266304016113, "rewards/rejected": 0.9365007281303406, "step": 1674 }, { "epoch": 0.9, "learning_rate": 9.027815601393183e-08, "logits/chosen": -2.0931777954101562, "logits/rejected": -2.121692419052124, "logps/chosen": -4.724564552307129, "logps/rejected": -10.72514820098877, "loss": 0.3645, "rewards/accuracies": 1.0, "rewards/chosen": 1.4034148454666138, "rewards/margins": 0.8212966918945312, "rewards/rejected": 0.5821181535720825, "step": 1675 }, { "epoch": 0.9, "learning_rate": 9.026521303077284e-08, "logits/chosen": -2.0691301822662354, "logits/rejected": -2.0676982402801514, "logps/chosen": -0.8479499816894531, "logps/rejected": -3.4246575832366943, "loss": 0.5014, "rewards/accuracies": 1.0, "rewards/chosen": 0.9004648327827454, "rewards/margins": 0.4291752874851227, "rewards/rejected": 0.4712895452976227, "step": 1676 }, { "epoch": 0.9, "learning_rate": 9.025226236675791e-08, "logits/chosen": -2.0510141849517822, "logits/rejected": -2.050363063812256, "logps/chosen": -1.3779380321502686, "logps/rejected": -4.933806896209717, "loss": 0.4609, "rewards/accuracies": 1.0, "rewards/chosen": 1.0590333938598633, "rewards/margins": 0.5353720188140869, "rewards/rejected": 0.5236613750457764, "step": 1677 }, { "epoch": 0.91, "learning_rate": 9.023930402435749e-08, "logits/chosen": -1.924555778503418, "logits/rejected": -2.2474966049194336, "logps/chosen": -5.889647483825684, "logps/rejected": -6.149150848388672, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.4402157962322235, "rewards/margins": 0.00555574893951416, "rewards/rejected": 0.43466004729270935, "step": 1678 }, { "epoch": 0.91, "learning_rate": 9.022633800604345e-08, "logits/chosen": -2.092783212661743, "logits/rejected": -2.089261293411255, "logps/chosen": -6.050175189971924, "logps/rejected": -2.936767578125, "loss": 0.4582, "rewards/accuracies": 1.0, "rewards/chosen": 1.2646929025650024, "rewards/margins": 0.5426109433174133, "rewards/rejected": 0.7220819592475891, "step": 1679 }, { "epoch": 0.91, "learning_rate": 9.021336431428915e-08, "logits/chosen": -2.0939266681671143, "logits/rejected": -2.0160911083221436, "logps/chosen": -26.86905288696289, "logps/rejected": -2.5798661708831787, "loss": 0.3932, "rewards/accuracies": 1.0, "rewards/chosen": 1.350976586341858, "rewards/margins": 0.7302759289741516, "rewards/rejected": 0.6207006573677063, "step": 1680 }, { "epoch": 0.91, "learning_rate": 9.020038295156941e-08, "logits/chosen": -2.045102596282959, "logits/rejected": -2.045009136199951, "logps/chosen": -2.0635299682617188, "logps/rejected": -0.6945787072181702, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 0.8807512521743774, "rewards/margins": -0.0070209503173828125, "rewards/rejected": 0.8877722024917603, "step": 1681 }, { "epoch": 0.91, "learning_rate": 9.01873939203605e-08, "logits/chosen": -2.0779483318328857, "logits/rejected": -2.2362215518951416, "logps/chosen": -0.4303444027900696, "logps/rejected": -0.4562102258205414, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.870193600654602, "rewards/margins": 0.029291927814483643, "rewards/rejected": 0.8409016728401184, "step": 1682 }, { "epoch": 0.91, "learning_rate": 9.017439722314017e-08, "logits/chosen": -2.012254238128662, "logits/rejected": -2.0177371501922607, "logps/chosen": -2.1226677894592285, "logps/rejected": -4.206094264984131, "loss": 0.458, "rewards/accuracies": 1.0, "rewards/chosen": 0.9576221704483032, "rewards/margins": 0.5432119369506836, "rewards/rejected": 0.414410263299942, "step": 1683 }, { "epoch": 0.91, "learning_rate": 9.016139286238764e-08, "logits/chosen": -1.9801057577133179, "logits/rejected": -1.9812254905700684, "logps/chosen": -6.604016304016113, "logps/rejected": -0.4034518003463745, "loss": 0.476, "rewards/accuracies": 1.0, "rewards/chosen": 1.180530071258545, "rewards/margins": 0.495033860206604, "rewards/rejected": 0.6854962110519409, "step": 1684 }, { "epoch": 0.91, "learning_rate": 9.014838084058357e-08, "logits/chosen": -2.1548845767974854, "logits/rejected": -2.3179967403411865, "logps/chosen": -12.804471969604492, "logps/rejected": -14.700775146484375, "loss": 0.8464, "rewards/accuracies": 0.0, "rewards/chosen": 0.7294079065322876, "rewards/margins": -0.2860361337661743, "rewards/rejected": 1.015444040298462, "step": 1685 }, { "epoch": 0.91, "learning_rate": 9.013536116021008e-08, "logits/chosen": -2.047583818435669, "logits/rejected": -2.2453038692474365, "logps/chosen": -0.6735260486602783, "logps/rejected": -0.6745400428771973, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.8621594309806824, "rewards/margins": 0.011923670768737793, "rewards/rejected": 0.8502357602119446, "step": 1686 }, { "epoch": 0.91, "learning_rate": 9.012233382375075e-08, "logits/chosen": -2.072197437286377, "logits/rejected": -2.3346331119537354, "logps/chosen": -7.604787826538086, "logps/rejected": -16.548810958862305, "loss": 0.5821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9170904159545898, "rewards/margins": 0.23606282472610474, "rewards/rejected": 0.6810275912284851, "step": 1687 }, { "epoch": 0.91, "learning_rate": 9.010929883369066e-08, "logits/chosen": -1.9754761457443237, "logits/rejected": -1.9779257774353027, "logps/chosen": -1.0452089309692383, "logps/rejected": -5.584008693695068, "loss": 0.5471, "rewards/accuracies": 1.0, "rewards/chosen": 0.8044760823249817, "rewards/margins": 0.31702733039855957, "rewards/rejected": 0.4874487519264221, "step": 1688 }, { "epoch": 0.91, "learning_rate": 9.009625619251631e-08, "logits/chosen": -2.047086715698242, "logits/rejected": -2.0484492778778076, "logps/chosen": -0.6858469843864441, "logps/rejected": -3.7532248497009277, "loss": 0.5258, "rewards/accuracies": 1.0, "rewards/chosen": 0.9482154250144958, "rewards/margins": 0.36846327781677246, "rewards/rejected": 0.5797521471977234, "step": 1689 }, { "epoch": 0.91, "learning_rate": 9.008320590271565e-08, "logits/chosen": -1.9838018417358398, "logits/rejected": -1.99178946018219, "logps/chosen": -3.908156156539917, "logps/rejected": -1.3005475997924805, "loss": 0.6688, "rewards/accuracies": 1.0, "rewards/chosen": 1.0229500532150269, "rewards/margins": 0.0493280291557312, "rewards/rejected": 0.9736220240592957, "step": 1690 }, { "epoch": 0.91, "learning_rate": 9.007014796677814e-08, "logits/chosen": -2.0280933380126953, "logits/rejected": -2.2737560272216797, "logps/chosen": -2.3534493446350098, "logps/rejected": -6.369187355041504, "loss": 0.6143, "rewards/accuracies": 1.0, "rewards/chosen": 1.1667848825454712, "rewards/margins": 0.1644841432571411, "rewards/rejected": 1.00230073928833, "step": 1691 }, { "epoch": 0.91, "learning_rate": 9.005708238719465e-08, "logits/chosen": -2.151639223098755, "logits/rejected": -2.3136706352233887, "logps/chosen": -0.6693345904350281, "logps/rejected": -0.6436893343925476, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9512031674385071, "rewards/margins": 0.022458255290985107, "rewards/rejected": 0.928744912147522, "step": 1692 }, { "epoch": 0.91, "learning_rate": 9.004400916645754e-08, "logits/chosen": -2.0142459869384766, "logits/rejected": -2.0199601650238037, "logps/chosen": -2.3064591884613037, "logps/rejected": -3.889322280883789, "loss": 0.4755, "rewards/accuracies": 1.0, "rewards/chosen": 1.0786151885986328, "rewards/margins": 0.49627214670181274, "rewards/rejected": 0.5823430418968201, "step": 1693 }, { "epoch": 0.91, "learning_rate": 9.003092830706058e-08, "logits/chosen": -2.0713415145874023, "logits/rejected": -2.099890947341919, "logps/chosen": -22.605077743530273, "logps/rejected": -13.71103572845459, "loss": 0.2013, "rewards/accuracies": 1.0, "rewards/chosen": 1.938104510307312, "rewards/margins": 1.5006818771362305, "rewards/rejected": 0.43742266297340393, "step": 1694 }, { "epoch": 0.91, "learning_rate": 9.001783981149906e-08, "logits/chosen": -2.041043758392334, "logits/rejected": -2.033186435699463, "logps/chosen": -5.259522914886475, "logps/rejected": -3.5941500663757324, "loss": 0.4445, "rewards/accuracies": 1.0, "rewards/chosen": 1.684574007987976, "rewards/margins": 0.5803079605102539, "rewards/rejected": 1.1042660474777222, "step": 1695 }, { "epoch": 0.91, "learning_rate": 9.000474368226971e-08, "logits/chosen": -1.9634695053100586, "logits/rejected": -1.9700002670288086, "logps/chosen": -5.300222396850586, "logps/rejected": -3.7646145820617676, "loss": 0.2817, "rewards/accuracies": 1.0, "rewards/chosen": 1.654628038406372, "rewards/margins": 1.12261962890625, "rewards/rejected": 0.5320084095001221, "step": 1696 }, { "epoch": 0.92, "learning_rate": 8.999163992187067e-08, "logits/chosen": -2.0867438316345215, "logits/rejected": -2.319013833999634, "logps/chosen": -1.377921462059021, "logps/rejected": -1.2705121040344238, "loss": 0.668, "rewards/accuracies": 1.0, "rewards/chosen": 1.0014232397079468, "rewards/margins": 0.05104464292526245, "rewards/rejected": 0.9503785967826843, "step": 1697 }, { "epoch": 0.92, "learning_rate": 8.997852853280157e-08, "logits/chosen": -2.129643201828003, "logits/rejected": -2.2622387409210205, "logps/chosen": -1.8358560800552368, "logps/rejected": -1.8941408395767212, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.9580218195915222, "rewards/margins": 0.01300811767578125, "rewards/rejected": 0.945013701915741, "step": 1698 }, { "epoch": 0.92, "learning_rate": 8.996540951756353e-08, "logits/chosen": -2.037768840789795, "logits/rejected": -2.2752819061279297, "logps/chosen": -1.0736682415008545, "logps/rejected": -1.0349370241165161, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.7998098731040955, "rewards/margins": 0.006554365158081055, "rewards/rejected": 0.7932555079460144, "step": 1699 }, { "epoch": 0.92, "learning_rate": 8.995228287865905e-08, "logits/chosen": -2.1029675006866455, "logits/rejected": -2.2801451683044434, "logps/chosen": -4.671213150024414, "logps/rejected": -4.4100165367126465, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.9230117797851562, "rewards/margins": -0.007628262042999268, "rewards/rejected": 0.9306400418281555, "step": 1700 }, { "epoch": 0.92, "learning_rate": 8.993914861859215e-08, "logits/chosen": -2.039437770843506, "logits/rejected": -2.26021146774292, "logps/chosen": -0.42239272594451904, "logps/rejected": -0.4320700466632843, "loss": 0.6706, "rewards/accuracies": 1.0, "rewards/chosen": 0.9630052447319031, "rewards/margins": 0.04557204246520996, "rewards/rejected": 0.9174332022666931, "step": 1701 }, { "epoch": 0.92, "learning_rate": 8.992600673986826e-08, "logits/chosen": -2.025144338607788, "logits/rejected": -2.0310051441192627, "logps/chosen": -2.1889636516571045, "logps/rejected": -3.450939178466797, "loss": 0.4999, "rewards/accuracies": 1.0, "rewards/chosen": 1.0579028129577637, "rewards/margins": 0.433013916015625, "rewards/rejected": 0.6248888969421387, "step": 1702 }, { "epoch": 0.92, "learning_rate": 8.991285724499429e-08, "logits/chosen": -2.043396234512329, "logits/rejected": -2.037041425704956, "logps/chosen": -9.886907577514648, "logps/rejected": -3.3302371501922607, "loss": 0.3967, "rewards/accuracies": 1.0, "rewards/chosen": 1.3757826089859009, "rewards/margins": 0.7198159098625183, "rewards/rejected": 0.6559666991233826, "step": 1703 }, { "epoch": 0.92, "learning_rate": 8.98997001364786e-08, "logits/chosen": -2.0949645042419434, "logits/rejected": -2.1016037464141846, "logps/chosen": -2.030003786087036, "logps/rejected": -2.749340772628784, "loss": 0.4767, "rewards/accuracies": 1.0, "rewards/chosen": 1.180557370185852, "rewards/margins": 0.4931570887565613, "rewards/rejected": 0.6874002814292908, "step": 1704 }, { "epoch": 0.92, "learning_rate": 8.988653541683097e-08, "logits/chosen": -2.109389543533325, "logits/rejected": -2.2944445610046387, "logps/chosen": -3.8235645294189453, "logps/rejected": -3.7811176776885986, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.7610440254211426, "rewards/margins": 0.005582690238952637, "rewards/rejected": 0.7554613351821899, "step": 1705 }, { "epoch": 0.92, "learning_rate": 8.987336308856269e-08, "logits/chosen": -2.174901247024536, "logits/rejected": -2.2763280868530273, "logps/chosen": -0.48017174005508423, "logps/rejected": -0.5076714754104614, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.8168727159500122, "rewards/margins": 0.007384598255157471, "rewards/rejected": 0.8094881176948547, "step": 1706 }, { "epoch": 0.92, "learning_rate": 8.986018315418641e-08, "logits/chosen": -1.9450078010559082, "logits/rejected": -2.2567386627197266, "logps/chosen": -2.803762912750244, "logps/rejected": -1.6611011028289795, "loss": 0.7696, "rewards/accuracies": 0.0, "rewards/chosen": 0.6528792977333069, "rewards/margins": -0.14739298820495605, "rewards/rejected": 0.8002722859382629, "step": 1707 }, { "epoch": 0.92, "learning_rate": 8.984699561621638e-08, "logits/chosen": -2.0677101612091064, "logits/rejected": -2.0624964237213135, "logps/chosen": -6.535760402679443, "logps/rejected": -3.304459571838379, "loss": 0.4774, "rewards/accuracies": 1.0, "rewards/chosen": 1.209455132484436, "rewards/margins": 0.49122154712677, "rewards/rejected": 0.718233585357666, "step": 1708 }, { "epoch": 0.92, "learning_rate": 8.983380047716815e-08, "logits/chosen": -1.9461842775344849, "logits/rejected": -1.9565417766571045, "logps/chosen": -6.920497894287109, "logps/rejected": -5.232658863067627, "loss": 0.3683, "rewards/accuracies": 1.0, "rewards/chosen": 1.4119840860366821, "rewards/margins": 0.8091690540313721, "rewards/rejected": 0.6028150320053101, "step": 1709 }, { "epoch": 0.92, "learning_rate": 8.982059773955877e-08, "logits/chosen": -2.082266092300415, "logits/rejected": -2.0937721729278564, "logps/chosen": -1.188056230545044, "logps/rejected": -11.275418281555176, "loss": 0.6315, "rewards/accuracies": 1.0, "rewards/chosen": 0.9586674571037292, "rewards/margins": 0.1273435354232788, "rewards/rejected": 0.8313239216804504, "step": 1710 }, { "epoch": 0.92, "learning_rate": 8.980738740590678e-08, "logits/chosen": -1.9241050481796265, "logits/rejected": -1.9076482057571411, "logps/chosen": -9.021830558776855, "logps/rejected": -1.2372719049453735, "loss": 0.6058, "rewards/accuracies": 1.0, "rewards/chosen": 1.2415200471878052, "rewards/margins": 0.18306314945220947, "rewards/rejected": 1.0584568977355957, "step": 1711 }, { "epoch": 0.92, "learning_rate": 8.979416947873215e-08, "logits/chosen": -2.0068507194519043, "logits/rejected": -2.014843463897705, "logps/chosen": -1.9292707443237305, "logps/rejected": -2.6049304008483887, "loss": 0.4377, "rewards/accuracies": 1.0, "rewards/chosen": 1.2421354055404663, "rewards/margins": 0.5995072722434998, "rewards/rejected": 0.6426281332969666, "step": 1712 }, { "epoch": 0.92, "learning_rate": 8.978094396055625e-08, "logits/chosen": -1.9932137727737427, "logits/rejected": -2.3091623783111572, "logps/chosen": -0.8797861337661743, "logps/rejected": -0.9340291619300842, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.8671402931213379, "rewards/margins": 0.032266438007354736, "rewards/rejected": 0.8348738551139832, "step": 1713 }, { "epoch": 0.92, "learning_rate": 8.976771085390195e-08, "logits/chosen": -2.0971436500549316, "logits/rejected": -2.092482328414917, "logps/chosen": -3.735177993774414, "logps/rejected": -1.9693289995193481, "loss": 0.3674, "rewards/accuracies": 1.0, "rewards/chosen": 1.4772939682006836, "rewards/margins": 0.8118715286254883, "rewards/rejected": 0.6654224395751953, "step": 1714 }, { "epoch": 0.93, "learning_rate": 8.975447016129357e-08, "logits/chosen": -2.0515100955963135, "logits/rejected": -2.0495691299438477, "logps/chosen": -1.804702639579773, "logps/rejected": -4.279053688049316, "loss": 0.4902, "rewards/accuracies": 1.0, "rewards/chosen": 1.0500214099884033, "rewards/margins": 0.4578220844268799, "rewards/rejected": 0.5921993255615234, "step": 1715 }, { "epoch": 0.93, "learning_rate": 8.974122188525683e-08, "logits/chosen": -2.177947759628296, "logits/rejected": -2.175297260284424, "logps/chosen": -8.027739524841309, "logps/rejected": -0.716175377368927, "loss": 0.6135, "rewards/accuracies": 1.0, "rewards/chosen": 0.860511302947998, "rewards/margins": 0.16624152660369873, "rewards/rejected": 0.6942697763442993, "step": 1716 }, { "epoch": 0.93, "learning_rate": 8.972796602831896e-08, "logits/chosen": -1.9424726963043213, "logits/rejected": -1.9436019659042358, "logps/chosen": -1.7856444120407104, "logps/rejected": -0.9447658658027649, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": 1.0605764389038086, "rewards/margins": 0.13255518674850464, "rewards/rejected": 0.928021252155304, "step": 1717 }, { "epoch": 0.93, "learning_rate": 8.971470259300856e-08, "logits/chosen": -2.0692903995513916, "logits/rejected": -2.0664095878601074, "logps/chosen": -6.362905502319336, "logps/rejected": -4.517873287200928, "loss": 0.3438, "rewards/accuracies": 1.0, "rewards/chosen": 1.5837172269821167, "rewards/margins": 0.8907211422920227, "rewards/rejected": 0.692996084690094, "step": 1718 }, { "epoch": 0.93, "learning_rate": 8.970143158185575e-08, "logits/chosen": -2.0309929847717285, "logits/rejected": -2.2614645957946777, "logps/chosen": -0.4479826092720032, "logps/rejected": -0.4464667737483978, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.8802412152290344, "rewards/margins": 0.02150094509124756, "rewards/rejected": 0.8587402701377869, "step": 1719 }, { "epoch": 0.93, "learning_rate": 8.968815299739205e-08, "logits/chosen": -1.947330355644226, "logits/rejected": -1.9066473245620728, "logps/chosen": -15.524734497070312, "logps/rejected": -8.111166954040527, "loss": 0.3948, "rewards/accuracies": 1.0, "rewards/chosen": 1.161083459854126, "rewards/margins": 0.7254258394241333, "rewards/rejected": 0.4356575906276703, "step": 1720 }, { "epoch": 0.93, "learning_rate": 8.967486684215047e-08, "logits/chosen": -1.9983150959014893, "logits/rejected": -2.004732131958008, "logps/chosen": -8.375876426696777, "logps/rejected": -1.6979560852050781, "loss": 0.6262, "rewards/accuracies": 1.0, "rewards/chosen": 1.2056790590286255, "rewards/margins": 0.13864636421203613, "rewards/rejected": 1.0670326948165894, "step": 1721 }, { "epoch": 0.93, "learning_rate": 8.966157311866538e-08, "logits/chosen": -2.038727045059204, "logits/rejected": -2.032931327819824, "logps/chosen": -8.385148048400879, "logps/rejected": -7.472899913787842, "loss": 0.4596, "rewards/accuracies": 1.0, "rewards/chosen": 1.0561686754226685, "rewards/margins": 0.5386773943901062, "rewards/rejected": 0.5174912810325623, "step": 1722 }, { "epoch": 0.93, "learning_rate": 8.964827182947267e-08, "logits/chosen": -2.078772783279419, "logits/rejected": -2.268958568572998, "logps/chosen": -0.8112444281578064, "logps/rejected": -0.7448791861534119, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9755150079727173, "rewards/margins": -3.421306610107422e-05, "rewards/rejected": 0.9755492210388184, "step": 1723 }, { "epoch": 0.93, "learning_rate": 8.963496297710966e-08, "logits/chosen": -2.0188770294189453, "logits/rejected": -2.029604196548462, "logps/chosen": -8.408193588256836, "logps/rejected": -3.100324869155884, "loss": 0.3876, "rewards/accuracies": 1.0, "rewards/chosen": 1.524782419204712, "rewards/margins": 0.7478583455085754, "rewards/rejected": 0.7769240736961365, "step": 1724 }, { "epoch": 0.93, "learning_rate": 8.962164656411508e-08, "logits/chosen": -2.1210999488830566, "logits/rejected": -2.2306127548217773, "logps/chosen": -0.6056355237960815, "logps/rejected": -0.5783569812774658, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 0.9733540415763855, "rewards/margins": 0.03192633390426636, "rewards/rejected": 0.9414277076721191, "step": 1725 }, { "epoch": 0.93, "learning_rate": 8.960832259302913e-08, "logits/chosen": -1.9959921836853027, "logits/rejected": -2.0053462982177734, "logps/chosen": -1.8223750591278076, "logps/rejected": -2.7169196605682373, "loss": 0.5206, "rewards/accuracies": 1.0, "rewards/chosen": 1.0444674491882324, "rewards/margins": 0.38113677501678467, "rewards/rejected": 0.6633306741714478, "step": 1726 }, { "epoch": 0.93, "learning_rate": 8.959499106639348e-08, "logits/chosen": -1.9931024312973022, "logits/rejected": -1.993776798248291, "logps/chosen": -0.9824819564819336, "logps/rejected": -2.8722593784332275, "loss": 0.5691, "rewards/accuracies": 1.0, "rewards/chosen": 0.9583532214164734, "rewards/margins": 0.26566147804260254, "rewards/rejected": 0.6926917433738708, "step": 1727 }, { "epoch": 0.93, "learning_rate": 8.958165198675114e-08, "logits/chosen": -2.0901598930358887, "logits/rejected": -2.3441946506500244, "logps/chosen": -2.585519552230835, "logps/rejected": -2.7187869548797607, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.6379043459892273, "rewards/margins": 0.03870266675949097, "rewards/rejected": 0.5992016792297363, "step": 1728 }, { "epoch": 0.93, "learning_rate": 8.956830535664668e-08, "logits/chosen": -2.043410062789917, "logits/rejected": -2.0373382568359375, "logps/chosen": -6.829435348510742, "logps/rejected": -0.7014720439910889, "loss": 0.3079, "rewards/accuracies": 1.0, "rewards/chosen": 1.9512939453125, "rewards/margins": 1.0200190544128418, "rewards/rejected": 0.9312748908996582, "step": 1729 }, { "epoch": 0.93, "learning_rate": 8.955495117862606e-08, "logits/chosen": -2.1205532550811768, "logits/rejected": -2.2967185974121094, "logps/chosen": -1.747605800628662, "logps/rejected": -1.7919979095458984, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.9557611346244812, "rewards/margins": -0.004136800765991211, "rewards/rejected": 0.9598979353904724, "step": 1730 }, { "epoch": 0.93, "learning_rate": 8.954158945523664e-08, "logits/chosen": -2.015197515487671, "logits/rejected": -2.286733388900757, "logps/chosen": -0.44889792799949646, "logps/rejected": -0.46561044454574585, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.7386693358421326, "rewards/margins": -0.0023077726364135742, "rewards/rejected": 0.7409771084785461, "step": 1731 }, { "epoch": 0.93, "learning_rate": 8.95282201890273e-08, "logits/chosen": -1.9175176620483398, "logits/rejected": -2.2477214336395264, "logps/chosen": -4.621048450469971, "logps/rejected": -4.231186866760254, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.6484261155128479, "rewards/margins": -0.00048804283142089844, "rewards/rejected": 0.6489141583442688, "step": 1732 }, { "epoch": 0.93, "learning_rate": 8.951484338254829e-08, "logits/chosen": -2.0190017223358154, "logits/rejected": -2.024785041809082, "logps/chosen": -1.746709942817688, "logps/rejected": -2.3102598190307617, "loss": 0.5005, "rewards/accuracies": 1.0, "rewards/chosen": 1.051106333732605, "rewards/margins": 0.43153542280197144, "rewards/rejected": 0.6195709109306335, "step": 1733 }, { "epoch": 0.94, "learning_rate": 8.950145903835131e-08, "logits/chosen": -2.114034414291382, "logits/rejected": -2.2559869289398193, "logps/chosen": -3.545165538787842, "logps/rejected": -3.5369997024536133, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.8728542327880859, "rewards/margins": 0.023411095142364502, "rewards/rejected": 0.8494431376457214, "step": 1734 }, { "epoch": 0.94, "learning_rate": 8.948806715898956e-08, "logits/chosen": -2.047905683517456, "logits/rejected": -2.2420785427093506, "logps/chosen": -0.4664381444454193, "logps/rejected": -0.4787387251853943, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.7431901097297668, "rewards/margins": 0.01027214527130127, "rewards/rejected": 0.7329179644584656, "step": 1735 }, { "epoch": 0.94, "learning_rate": 8.947466774701757e-08, "logits/chosen": -2.0233216285705566, "logits/rejected": -2.285888195037842, "logps/chosen": -4.999838829040527, "logps/rejected": -4.841273784637451, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.4794800877571106, "rewards/margins": -0.012199074029922485, "rewards/rejected": 0.4916791617870331, "step": 1736 }, { "epoch": 0.94, "learning_rate": 8.94612608049914e-08, "logits/chosen": -1.9982093572616577, "logits/rejected": -1.9983816146850586, "logps/chosen": -2.030272960662842, "logps/rejected": -2.1602678298950195, "loss": 0.5604, "rewards/accuracies": 1.0, "rewards/chosen": 1.2259085178375244, "rewards/margins": 0.2857971787452698, "rewards/rejected": 0.9401113390922546, "step": 1737 }, { "epoch": 0.94, "learning_rate": 8.944784633546853e-08, "logits/chosen": -2.104376792907715, "logits/rejected": -2.2179348468780518, "logps/chosen": -1.8709661960601807, "logps/rejected": -4.889904499053955, "loss": 0.6407, "rewards/accuracies": 1.0, "rewards/chosen": 0.8762998580932617, "rewards/margins": 0.10779309272766113, "rewards/rejected": 0.7685067653656006, "step": 1738 }, { "epoch": 0.94, "learning_rate": 8.943442434100784e-08, "logits/chosen": -2.025379180908203, "logits/rejected": -2.1911051273345947, "logps/chosen": -5.953829765319824, "logps/rejected": -1.0982415676116943, "loss": 0.8231, "rewards/accuracies": 0.0, "rewards/chosen": 0.6181803941726685, "rewards/margins": -0.24490058422088623, "rewards/rejected": 0.8630809783935547, "step": 1739 }, { "epoch": 0.94, "learning_rate": 8.942099482416964e-08, "logits/chosen": -1.9625463485717773, "logits/rejected": -2.2245280742645264, "logps/chosen": -10.274738311767578, "logps/rejected": -9.953340530395508, "loss": 0.7023, "rewards/accuracies": 0.0, "rewards/chosen": 0.32439976930618286, "rewards/margins": -0.018310636281967163, "rewards/rejected": 0.34271040558815, "step": 1740 }, { "epoch": 0.94, "learning_rate": 8.940755778751574e-08, "logits/chosen": -2.047637939453125, "logits/rejected": -2.267707109451294, "logps/chosen": -1.3628392219543457, "logps/rejected": -1.4408735036849976, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9382578134536743, "rewards/margins": 0.005741775035858154, "rewards/rejected": 0.9325160384178162, "step": 1741 }, { "epoch": 0.94, "learning_rate": 8.939411323360933e-08, "logits/chosen": -2.1090006828308105, "logits/rejected": -2.1108529567718506, "logps/chosen": -0.5232328772544861, "logps/rejected": -3.7765605449676514, "loss": 0.4701, "rewards/accuracies": 1.0, "rewards/chosen": 1.0274049043655396, "rewards/margins": 0.510572075843811, "rewards/rejected": 0.5168328285217285, "step": 1742 }, { "epoch": 0.94, "learning_rate": 8.938066116501504e-08, "logits/chosen": -2.071441173553467, "logits/rejected": -2.167480707168579, "logps/chosen": -1.6435368061065674, "logps/rejected": -26.746477127075195, "loss": 0.2679, "rewards/accuracies": 1.0, "rewards/chosen": 1.1087017059326172, "rewards/margins": 1.1800254583358765, "rewards/rejected": -0.07132377475500107, "step": 1743 }, { "epoch": 0.94, "learning_rate": 8.936720158429894e-08, "logits/chosen": -2.0330166816711426, "logits/rejected": -2.2700774669647217, "logps/chosen": -1.0163875818252563, "logps/rejected": -0.9795167446136475, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.8918753862380981, "rewards/margins": 0.027514934539794922, "rewards/rejected": 0.8643604516983032, "step": 1744 }, { "epoch": 0.94, "learning_rate": 8.935373449402855e-08, "logits/chosen": -2.0670666694641113, "logits/rejected": -2.066817045211792, "logps/chosen": -1.3874201774597168, "logps/rejected": -2.1856093406677246, "loss": 0.6453, "rewards/accuracies": 1.0, "rewards/chosen": 1.0645716190338135, "rewards/margins": 0.09801030158996582, "rewards/rejected": 0.9665613174438477, "step": 1745 }, { "epoch": 0.94, "learning_rate": 8.934025989677279e-08, "logits/chosen": -2.1927742958068848, "logits/rejected": -2.111143112182617, "logps/chosen": -26.551671981811523, "logps/rejected": -1.9783042669296265, "loss": 0.3024, "rewards/accuracies": 1.0, "rewards/chosen": 1.7281172275543213, "rewards/margins": 1.0408122539520264, "rewards/rejected": 0.6873050332069397, "step": 1746 }, { "epoch": 0.94, "learning_rate": 8.932677779510202e-08, "logits/chosen": -2.206132173538208, "logits/rejected": -2.037243604660034, "logps/chosen": -55.94173812866211, "logps/rejected": -12.018352508544922, "loss": 0.3322, "rewards/accuracies": 1.0, "rewards/chosen": 1.8121922016143799, "rewards/margins": 0.9314558506011963, "rewards/rejected": 0.8807363510131836, "step": 1747 }, { "epoch": 0.94, "learning_rate": 8.931328819158808e-08, "logits/chosen": -2.103806495666504, "logits/rejected": -2.194708824157715, "logps/chosen": -2.4452991485595703, "logps/rejected": -2.39919114112854, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9309083819389343, "rewards/margins": 0.01660841703414917, "rewards/rejected": 0.9142999649047852, "step": 1748 }, { "epoch": 0.94, "learning_rate": 8.929979108880416e-08, "logits/chosen": -2.0820746421813965, "logits/rejected": -2.0829081535339355, "logps/chosen": -1.5538017749786377, "logps/rejected": -4.479108810424805, "loss": 0.4717, "rewards/accuracies": 1.0, "rewards/chosen": 0.994268536567688, "rewards/margins": 0.5064016580581665, "rewards/rejected": 0.4878668785095215, "step": 1749 }, { "epoch": 0.94, "learning_rate": 8.928628648932495e-08, "logits/chosen": -1.981157898902893, "logits/rejected": -1.988427996635437, "logps/chosen": -1.9781752824783325, "logps/rejected": -2.500999689102173, "loss": 0.5351, "rewards/accuracies": 1.0, "rewards/chosen": 1.0062603950500488, "rewards/margins": 0.3457425832748413, "rewards/rejected": 0.6605178117752075, "step": 1750 }, { "epoch": 0.94, "learning_rate": 8.927277439572651e-08, "logits/chosen": -2.037998676300049, "logits/rejected": -2.2236227989196777, "logps/chosen": -0.30603253841400146, "logps/rejected": -0.3471490740776062, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.9552372097969055, "rewards/margins": 0.027518928050994873, "rewards/rejected": 0.9277182817459106, "step": 1751 }, { "epoch": 0.94, "learning_rate": 8.925925481058639e-08, "logits/chosen": -2.0291976928710938, "logits/rejected": -2.2519938945770264, "logps/chosen": -1.0597419738769531, "logps/rejected": -1.1098394393920898, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.7836991548538208, "rewards/margins": 0.023948311805725098, "rewards/rejected": 0.7597508430480957, "step": 1752 }, { "epoch": 0.95, "learning_rate": 8.924572773648353e-08, "logits/chosen": -2.0471019744873047, "logits/rejected": -2.0485451221466064, "logps/chosen": -2.2979466915130615, "logps/rejected": -1.6041309833526611, "loss": 0.6232, "rewards/accuracies": 1.0, "rewards/chosen": 1.121084213256836, "rewards/margins": 0.1450709104537964, "rewards/rejected": 0.9760133028030396, "step": 1753 }, { "epoch": 0.95, "learning_rate": 8.923219317599833e-08, "logits/chosen": -2.0439672470092773, "logits/rejected": -2.2939116954803467, "logps/chosen": -2.903475284576416, "logps/rejected": -4.26884126663208, "loss": 0.6161, "rewards/accuracies": 1.0, "rewards/chosen": 0.5636085867881775, "rewards/margins": 0.16049659252166748, "rewards/rejected": 0.40311199426651, "step": 1754 }, { "epoch": 0.95, "learning_rate": 8.921865113171256e-08, "logits/chosen": -2.1811418533325195, "logits/rejected": -2.254289150238037, "logps/chosen": -4.513854026794434, "logps/rejected": -26.3050537109375, "loss": 0.4109, "rewards/accuracies": 1.0, "rewards/chosen": 1.071444034576416, "rewards/margins": 0.6770256757736206, "rewards/rejected": 0.394418329000473, "step": 1755 }, { "epoch": 0.95, "learning_rate": 8.920510160620947e-08, "logits/chosen": -2.1086809635162354, "logits/rejected": -1.9375284910202026, "logps/chosen": -41.486061096191406, "logps/rejected": -2.7642383575439453, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 1.5745381116867065, "rewards/margins": 0.889665961265564, "rewards/rejected": 0.6848721504211426, "step": 1756 }, { "epoch": 0.95, "learning_rate": 8.919154460207371e-08, "logits/chosen": -2.192068338394165, "logits/rejected": -2.1993277072906494, "logps/chosen": -1.854885458946228, "logps/rejected": -2.9892866611480713, "loss": 0.522, "rewards/accuracies": 1.0, "rewards/chosen": 0.9703200459480286, "rewards/margins": 0.3778020143508911, "rewards/rejected": 0.5925180315971375, "step": 1757 }, { "epoch": 0.95, "learning_rate": 8.91779801218914e-08, "logits/chosen": -2.081712007522583, "logits/rejected": -2.08647084236145, "logps/chosen": -6.828949928283691, "logps/rejected": -8.422203063964844, "loss": 0.3066, "rewards/accuracies": 1.0, "rewards/chosen": 1.502610683441162, "rewards/margins": 1.0249882936477661, "rewards/rejected": 0.4776224195957184, "step": 1758 }, { "epoch": 0.95, "learning_rate": 8.916440816825003e-08, "logits/chosen": -1.995651125907898, "logits/rejected": -2.202991247177124, "logps/chosen": -2.0993807315826416, "logps/rejected": -2.1651744842529297, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.9266495108604431, "rewards/margins": 0.018916547298431396, "rewards/rejected": 0.9077329635620117, "step": 1759 }, { "epoch": 0.95, "learning_rate": 8.915082874373855e-08, "logits/chosen": -2.0120041370391846, "logits/rejected": -2.237508535385132, "logps/chosen": -9.55699634552002, "logps/rejected": -6.211907863616943, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.8029868006706238, "rewards/margins": 0.02239304780960083, "rewards/rejected": 0.780593752861023, "step": 1760 }, { "epoch": 0.95, "learning_rate": 8.913724185094732e-08, "logits/chosen": -2.0436158180236816, "logits/rejected": -2.103395462036133, "logps/chosen": -9.586009979248047, "logps/rejected": -7.891464710235596, "loss": 0.5694, "rewards/accuracies": 1.0, "rewards/chosen": 1.2455543279647827, "rewards/margins": 0.26504015922546387, "rewards/rejected": 0.9805141687393188, "step": 1761 }, { "epoch": 0.95, "learning_rate": 8.912364749246812e-08, "logits/chosen": -2.1265604496002197, "logits/rejected": -2.300461530685425, "logps/chosen": -0.5407437086105347, "logps/rejected": -0.5355252623558044, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8473510146141052, "rewards/margins": 0.01953202486038208, "rewards/rejected": 0.8278189897537231, "step": 1762 }, { "epoch": 0.95, "learning_rate": 8.911004567089419e-08, "logits/chosen": -2.0285065174102783, "logits/rejected": -2.278978109359741, "logps/chosen": -2.5113673210144043, "logps/rejected": -2.3330588340759277, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 1.172998070716858, "rewards/margins": 0.0319058895111084, "rewards/rejected": 1.1410921812057495, "step": 1763 }, { "epoch": 0.95, "learning_rate": 8.909643638882015e-08, "logits/chosen": -2.1438517570495605, "logits/rejected": -2.143922805786133, "logps/chosen": -3.3227181434631348, "logps/rejected": -3.092489719390869, "loss": 0.2996, "rewards/accuracies": 1.0, "rewards/chosen": 1.6570539474487305, "rewards/margins": 1.0516064167022705, "rewards/rejected": 0.60544753074646, "step": 1764 }, { "epoch": 0.95, "learning_rate": 8.908281964884206e-08, "logits/chosen": -2.0003647804260254, "logits/rejected": -1.9958808422088623, "logps/chosen": -3.095139503479004, "logps/rejected": -4.954375267028809, "loss": 0.5821, "rewards/accuracies": 1.0, "rewards/chosen": 1.0734626054763794, "rewards/margins": 0.23604470491409302, "rewards/rejected": 0.8374179005622864, "step": 1765 }, { "epoch": 0.95, "learning_rate": 8.90691954535574e-08, "logits/chosen": -2.155880928039551, "logits/rejected": -2.1170010566711426, "logps/chosen": -24.599411010742188, "logps/rejected": -3.9023895263671875, "loss": 0.3711, "rewards/accuracies": 1.0, "rewards/chosen": 1.3761875629425049, "rewards/margins": 0.8000510334968567, "rewards/rejected": 0.5761365294456482, "step": 1766 }, { "epoch": 0.95, "learning_rate": 8.905556380556509e-08, "logits/chosen": -2.0323824882507324, "logits/rejected": -2.291642427444458, "logps/chosen": -0.8135073184967041, "logps/rejected": -0.8377029895782471, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9150751233100891, "rewards/margins": 0.014275729656219482, "rewards/rejected": 0.9007993936538696, "step": 1767 }, { "epoch": 0.95, "learning_rate": 8.904192470746545e-08, "logits/chosen": -2.1972525119781494, "logits/rejected": -2.2488582134246826, "logps/chosen": -19.87291717529297, "logps/rejected": -9.997535705566406, "loss": 0.6594, "rewards/accuracies": 1.0, "rewards/chosen": 0.8789235949516296, "rewards/margins": 0.06860005855560303, "rewards/rejected": 0.8103235363960266, "step": 1768 }, { "epoch": 0.95, "learning_rate": 8.902827816186022e-08, "logits/chosen": -2.155224084854126, "logits/rejected": -2.069053888320923, "logps/chosen": -24.25074005126953, "logps/rejected": -2.981842279434204, "loss": 0.2185, "rewards/accuracies": 1.0, "rewards/chosen": 1.971778154373169, "rewards/margins": 1.4097237586975098, "rewards/rejected": 0.562054455280304, "step": 1769 }, { "epoch": 0.95, "learning_rate": 8.901462417135259e-08, "logits/chosen": -1.9649473428726196, "logits/rejected": -1.965834617614746, "logps/chosen": -4.689169883728027, "logps/rejected": -3.3291285037994385, "loss": 0.2915, "rewards/accuracies": 1.0, "rewards/chosen": 1.6377826929092407, "rewards/margins": 1.0834763050079346, "rewards/rejected": 0.5543064475059509, "step": 1770 }, { "epoch": 0.96, "learning_rate": 8.900096273854711e-08, "logits/chosen": -2.096118927001953, "logits/rejected": -2.1627163887023926, "logps/chosen": -16.96392822265625, "logps/rejected": -12.665145874023438, "loss": 0.5447, "rewards/accuracies": 1.0, "rewards/chosen": 1.3047064542770386, "rewards/margins": 0.3227236270904541, "rewards/rejected": 0.9819828271865845, "step": 1771 }, { "epoch": 0.96, "learning_rate": 8.898729386604983e-08, "logits/chosen": -1.9774720668792725, "logits/rejected": -1.9845318794250488, "logps/chosen": -2.0944294929504395, "logps/rejected": -2.7500874996185303, "loss": 0.4914, "rewards/accuracies": 1.0, "rewards/chosen": 1.0400505065917969, "rewards/margins": 0.4547887444496155, "rewards/rejected": 0.5852617621421814, "step": 1772 }, { "epoch": 0.96, "learning_rate": 8.897361755646815e-08, "logits/chosen": -2.138570785522461, "logits/rejected": -2.133622169494629, "logps/chosen": -3.284057140350342, "logps/rejected": -3.0059967041015625, "loss": 0.413, "rewards/accuracies": 1.0, "rewards/chosen": 1.4785435199737549, "rewards/margins": 0.6707702279090881, "rewards/rejected": 0.8077732920646667, "step": 1773 }, { "epoch": 0.96, "learning_rate": 8.895993381241093e-08, "logits/chosen": -2.1056928634643555, "logits/rejected": -2.35103440284729, "logps/chosen": -0.35320279002189636, "logps/rejected": -0.35003727674484253, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.7549110651016235, "rewards/margins": 0.026724934577941895, "rewards/rejected": 0.7281861305236816, "step": 1774 }, { "epoch": 0.96, "learning_rate": 8.894624263648845e-08, "logits/chosen": -2.0202903747558594, "logits/rejected": -2.0196950435638428, "logps/chosen": -0.5377929210662842, "logps/rejected": -3.6362266540527344, "loss": 0.5603, "rewards/accuracies": 1.0, "rewards/chosen": 0.8972625732421875, "rewards/margins": 0.2860835790634155, "rewards/rejected": 0.611178994178772, "step": 1775 }, { "epoch": 0.96, "learning_rate": 8.893254403131237e-08, "logits/chosen": -1.968285083770752, "logits/rejected": -2.1491949558258057, "logps/chosen": -0.5676701068878174, "logps/rejected": -0.6312063932418823, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9914502501487732, "rewards/margins": 0.021663188934326172, "rewards/rejected": 0.969787061214447, "step": 1776 }, { "epoch": 0.96, "learning_rate": 8.891883799949579e-08, "logits/chosen": -2.122204542160034, "logits/rejected": -2.3071300983428955, "logps/chosen": -2.5021274089813232, "logps/rejected": -2.3648478984832764, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.8075386881828308, "rewards/margins": 0.007453620433807373, "rewards/rejected": 0.8000850677490234, "step": 1777 }, { "epoch": 0.96, "learning_rate": 8.890512454365322e-08, "logits/chosen": -2.009768009185791, "logits/rejected": -2.0110795497894287, "logps/chosen": -0.9541516304016113, "logps/rejected": -3.1668801307678223, "loss": 0.4956, "rewards/accuracies": 1.0, "rewards/chosen": 1.1325390338897705, "rewards/margins": 0.4439769387245178, "rewards/rejected": 0.6885620951652527, "step": 1778 }, { "epoch": 0.96, "learning_rate": 8.889140366640062e-08, "logits/chosen": -2.0492899417877197, "logits/rejected": -2.0403873920440674, "logps/chosen": -13.187639236450195, "logps/rejected": -3.590937376022339, "loss": 0.4594, "rewards/accuracies": 1.0, "rewards/chosen": 1.0360370874404907, "rewards/margins": 0.539273738861084, "rewards/rejected": 0.49676331877708435, "step": 1779 }, { "epoch": 0.96, "learning_rate": 8.88776753703553e-08, "logits/chosen": -2.046776533126831, "logits/rejected": -2.296706438064575, "logps/chosen": -8.871349334716797, "logps/rejected": -10.343753814697266, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.8768644332885742, "rewards/margins": 0.022200942039489746, "rewards/rejected": 0.8546634912490845, "step": 1780 }, { "epoch": 0.96, "learning_rate": 8.886393965813607e-08, "logits/chosen": -2.0476832389831543, "logits/rejected": -2.2607712745666504, "logps/chosen": -0.811590313911438, "logps/rejected": -0.7672699093818665, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.9153215289115906, "rewards/margins": 0.020026206970214844, "rewards/rejected": 0.8952953219413757, "step": 1781 }, { "epoch": 0.96, "learning_rate": 8.885019653236308e-08, "logits/chosen": -2.0008673667907715, "logits/rejected": -2.0064029693603516, "logps/chosen": -1.6875196695327759, "logps/rejected": -3.1477060317993164, "loss": 0.5157, "rewards/accuracies": 1.0, "rewards/chosen": 0.9528532028198242, "rewards/margins": 0.3932109475135803, "rewards/rejected": 0.5596422553062439, "step": 1782 }, { "epoch": 0.96, "learning_rate": 8.883644599565792e-08, "logits/chosen": -2.2138631343841553, "logits/rejected": -2.107079267501831, "logps/chosen": -32.622764587402344, "logps/rejected": -4.43997859954834, "loss": 0.2176, "rewards/accuracies": 1.0, "rewards/chosen": 1.9057514667510986, "rewards/margins": 1.4141793251037598, "rewards/rejected": 0.4915721118450165, "step": 1783 }, { "epoch": 0.96, "learning_rate": 8.882268805064362e-08, "logits/chosen": -2.0348243713378906, "logits/rejected": -2.0396714210510254, "logps/chosen": -11.050111770629883, "logps/rejected": -4.426734924316406, "loss": 0.6643, "rewards/accuracies": 1.0, "rewards/chosen": 0.854417622089386, "rewards/margins": 0.0586169958114624, "rewards/rejected": 0.7958006262779236, "step": 1784 }, { "epoch": 0.96, "learning_rate": 8.880892269994454e-08, "logits/chosen": -2.0378119945526123, "logits/rejected": -2.2822558879852295, "logps/chosen": -0.3834248185157776, "logps/rejected": -0.4779440760612488, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.8839201331138611, "rewards/margins": 0.018572449684143066, "rewards/rejected": 0.865347683429718, "step": 1785 }, { "epoch": 0.96, "learning_rate": 8.879514994618658e-08, "logits/chosen": -2.071321725845337, "logits/rejected": -2.2722668647766113, "logps/chosen": -4.260043144226074, "logps/rejected": -1.4127448797225952, "loss": 0.7433, "rewards/accuracies": 0.0, "rewards/chosen": 0.616639256477356, "rewards/margins": -0.09793752431869507, "rewards/rejected": 0.714576780796051, "step": 1786 }, { "epoch": 0.96, "learning_rate": 8.878136979199695e-08, "logits/chosen": -2.096782684326172, "logits/rejected": -2.0682060718536377, "logps/chosen": -13.128716468811035, "logps/rejected": -1.901106834411621, "loss": 0.3532, "rewards/accuracies": 1.0, "rewards/chosen": 1.5468500852584839, "rewards/margins": 0.8588089346885681, "rewards/rejected": 0.6880411505699158, "step": 1787 }, { "epoch": 0.96, "learning_rate": 8.876758224000431e-08, "logits/chosen": -2.041912317276001, "logits/rejected": -2.0404231548309326, "logps/chosen": -2.933825731277466, "logps/rejected": -4.475748062133789, "loss": 0.3222, "rewards/accuracies": 1.0, "rewards/chosen": 1.4604321718215942, "rewards/margins": 0.9671366810798645, "rewards/rejected": 0.49329549074172974, "step": 1788 }, { "epoch": 0.96, "learning_rate": 8.875378729283873e-08, "logits/chosen": -1.9893625974655151, "logits/rejected": -1.988968014717102, "logps/chosen": -0.2901555299758911, "logps/rejected": -4.504409313201904, "loss": 0.4922, "rewards/accuracies": 1.0, "rewards/chosen": 0.9372078776359558, "rewards/margins": 0.4526197612285614, "rewards/rejected": 0.4845881164073944, "step": 1789 }, { "epoch": 0.97, "learning_rate": 8.873998495313168e-08, "logits/chosen": -2.0173332691192627, "logits/rejected": -2.258652687072754, "logps/chosen": -0.9426173567771912, "logps/rejected": -1.0096948146820068, "loss": 0.6621, "rewards/accuracies": 1.0, "rewards/chosen": 0.9640825390815735, "rewards/margins": 0.06315672397613525, "rewards/rejected": 0.9009258151054382, "step": 1790 }, { "epoch": 0.97, "learning_rate": 8.872617522351607e-08, "logits/chosen": -2.1296746730804443, "logits/rejected": -2.114828586578369, "logps/chosen": -9.093395233154297, "logps/rejected": -5.911407470703125, "loss": 0.3955, "rewards/accuracies": 1.0, "rewards/chosen": 1.3312686681747437, "rewards/margins": 0.7232062816619873, "rewards/rejected": 0.6080623865127563, "step": 1791 }, { "epoch": 0.97, "learning_rate": 8.871235810662617e-08, "logits/chosen": -2.041215181350708, "logits/rejected": -2.1402106285095215, "logps/chosen": -3.556914806365967, "logps/rejected": -10.046818733215332, "loss": 0.5268, "rewards/accuracies": 1.0, "rewards/chosen": 1.2127599716186523, "rewards/margins": 0.3660603165626526, "rewards/rejected": 0.8466996550559998, "step": 1792 }, { "epoch": 0.97, "learning_rate": 8.86985336050977e-08, "logits/chosen": -2.1132888793945312, "logits/rejected": -2.031466007232666, "logps/chosen": -16.015541076660156, "logps/rejected": -4.3191914558410645, "loss": 0.4283, "rewards/accuracies": 1.0, "rewards/chosen": 1.3606475591659546, "rewards/margins": 0.6260553598403931, "rewards/rejected": 0.7345921993255615, "step": 1793 }, { "epoch": 0.97, "learning_rate": 8.868470172156777e-08, "logits/chosen": -2.0457684993743896, "logits/rejected": -2.175006866455078, "logps/chosen": -0.7438490390777588, "logps/rejected": -0.5631361603736877, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.7449265718460083, "rewards/margins": 0.0013408660888671875, "rewards/rejected": 0.7435857057571411, "step": 1794 }, { "epoch": 0.97, "learning_rate": 8.867086245867491e-08, "logits/chosen": -2.1976308822631836, "logits/rejected": -2.246619701385498, "logps/chosen": -6.8065080642700195, "logps/rejected": -17.62061309814453, "loss": 0.3603, "rewards/accuracies": 1.0, "rewards/chosen": 1.2904711961746216, "rewards/margins": 0.8353030681610107, "rewards/rejected": 0.4551681578159332, "step": 1795 }, { "epoch": 0.97, "learning_rate": 8.865701581905905e-08, "logits/chosen": -2.045790910720825, "logits/rejected": -2.1171610355377197, "logps/chosen": -2.8835763931274414, "logps/rejected": -19.597923278808594, "loss": 0.5629, "rewards/accuracies": 1.0, "rewards/chosen": 1.2911876440048218, "rewards/margins": 0.28008079528808594, "rewards/rejected": 1.0111068487167358, "step": 1796 }, { "epoch": 0.97, "learning_rate": 8.86431618053615e-08, "logits/chosen": -1.967559576034546, "logits/rejected": -1.9648398160934448, "logps/chosen": -3.2420248985290527, "logps/rejected": -3.7139203548431396, "loss": 0.3891, "rewards/accuracies": 1.0, "rewards/chosen": 1.3367174863815308, "rewards/margins": 0.7431487441062927, "rewards/rejected": 0.593568742275238, "step": 1797 }, { "epoch": 0.97, "learning_rate": 8.862930042022507e-08, "logits/chosen": -2.0016565322875977, "logits/rejected": -2.2966809272766113, "logps/chosen": -0.6576706171035767, "logps/rejected": -0.7342870831489563, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 1.0394381284713745, "rewards/margins": 0.04574817419052124, "rewards/rejected": 0.9936899542808533, "step": 1798 }, { "epoch": 0.97, "learning_rate": 8.861543166629384e-08, "logits/chosen": -1.989499568939209, "logits/rejected": -2.243853807449341, "logps/chosen": -0.6111470460891724, "logps/rejected": -0.6105438470840454, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.8271713256835938, "rewards/margins": -0.006439149379730225, "rewards/rejected": 0.833610475063324, "step": 1799 }, { "epoch": 0.97, "learning_rate": 8.860155554621342e-08, "logits/chosen": -2.1495893001556396, "logits/rejected": -2.2234435081481934, "logps/chosen": -6.226284027099609, "logps/rejected": -16.66171646118164, "loss": 0.5194, "rewards/accuracies": 1.0, "rewards/chosen": 1.0301223993301392, "rewards/margins": 0.38411158323287964, "rewards/rejected": 0.6460108160972595, "step": 1800 }, { "epoch": 0.97, "learning_rate": 8.858767206263074e-08, "logits/chosen": -2.107264995574951, "logits/rejected": -2.2776849269866943, "logps/chosen": -0.6993376016616821, "logps/rejected": -0.721696138381958, "loss": 0.6755, "rewards/accuracies": 1.0, "rewards/chosen": 0.9806235432624817, "rewards/margins": 0.03565406799316406, "rewards/rejected": 0.9449694752693176, "step": 1801 }, { "epoch": 0.97, "learning_rate": 8.857378121819416e-08, "logits/chosen": -2.155104398727417, "logits/rejected": -2.1594457626342773, "logps/chosen": -2.94560170173645, "logps/rejected": -3.5969161987304688, "loss": 0.426, "rewards/accuracies": 1.0, "rewards/chosen": 1.1978901624679565, "rewards/margins": 0.6326839923858643, "rewards/rejected": 0.5652061700820923, "step": 1802 }, { "epoch": 0.97, "learning_rate": 8.855988301555349e-08, "logits/chosen": -2.1403517723083496, "logits/rejected": -2.2751457691192627, "logps/chosen": -2.8247640132904053, "logps/rejected": -0.7476545572280884, "loss": 0.7201, "rewards/accuracies": 0.0, "rewards/chosen": 0.8351237177848816, "rewards/margins": -0.05323982238769531, "rewards/rejected": 0.8883635401725769, "step": 1803 }, { "epoch": 0.97, "learning_rate": 8.854597745735987e-08, "logits/chosen": -2.030095100402832, "logits/rejected": -2.245570182800293, "logps/chosen": -1.4839303493499756, "logps/rejected": -45.86762237548828, "loss": 0.2596, "rewards/accuracies": 1.0, "rewards/chosen": 0.8594006896018982, "rewards/margins": 1.2161054611206055, "rewards/rejected": -0.3567047119140625, "step": 1804 }, { "epoch": 0.97, "learning_rate": 8.853206454626589e-08, "logits/chosen": -2.0576210021972656, "logits/rejected": -2.058103322982788, "logps/chosen": -2.715846538543701, "logps/rejected": -1.159216284751892, "loss": 0.6124, "rewards/accuracies": 1.0, "rewards/chosen": 1.0934826135635376, "rewards/margins": 0.16859310865402222, "rewards/rejected": 0.9248895049095154, "step": 1805 }, { "epoch": 0.97, "learning_rate": 8.851814428492553e-08, "logits/chosen": -2.072157859802246, "logits/rejected": -2.3352444171905518, "logps/chosen": -2.491450309753418, "logps/rejected": -2.2752413749694824, "loss": 0.6695, "rewards/accuracies": 1.0, "rewards/chosen": 1.0476794242858887, "rewards/margins": 0.04792642593383789, "rewards/rejected": 0.9997529983520508, "step": 1806 }, { "epoch": 0.97, "learning_rate": 8.850421667599416e-08, "logits/chosen": -1.9553450345993042, "logits/rejected": -1.9617879390716553, "logps/chosen": -2.0158166885375977, "logps/rejected": -4.165088176727295, "loss": 0.6609, "rewards/accuracies": 1.0, "rewards/chosen": 1.093645453453064, "rewards/margins": 0.06556892395019531, "rewards/rejected": 1.0280765295028687, "step": 1807 }, { "epoch": 0.98, "learning_rate": 8.849028172212859e-08, "logits/chosen": -2.05319881439209, "logits/rejected": -2.0485117435455322, "logps/chosen": -11.808234214782715, "logps/rejected": -5.7435994148254395, "loss": 0.2937, "rewards/accuracies": 1.0, "rewards/chosen": 1.6613247394561768, "rewards/margins": 1.0747907161712646, "rewards/rejected": 0.5865340828895569, "step": 1808 }, { "epoch": 0.98, "learning_rate": 8.847633942598698e-08, "logits/chosen": -1.9603983163833618, "logits/rejected": -2.2515435218811035, "logps/chosen": -0.46955859661102295, "logps/rejected": -0.4668274521827698, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.9363951086997986, "rewards/margins": 0.0022228360176086426, "rewards/rejected": 0.9341722726821899, "step": 1809 }, { "epoch": 0.98, "learning_rate": 8.846238979022892e-08, "logits/chosen": -2.009223699569702, "logits/rejected": -2.004077434539795, "logps/chosen": -6.8785223960876465, "logps/rejected": -4.187273979187012, "loss": 0.312, "rewards/accuracies": 1.0, "rewards/chosen": 1.565250277519226, "rewards/margins": 1.0046930313110352, "rewards/rejected": 0.5605573058128357, "step": 1810 }, { "epoch": 0.98, "learning_rate": 8.844843281751539e-08, "logits/chosen": -2.071746587753296, "logits/rejected": -2.0771915912628174, "logps/chosen": -2.3413424491882324, "logps/rejected": -4.08708381652832, "loss": 0.5064, "rewards/accuracies": 1.0, "rewards/chosen": 1.0564554929733276, "rewards/margins": 0.4164885878562927, "rewards/rejected": 0.6399669051170349, "step": 1811 }, { "epoch": 0.98, "learning_rate": 8.843446851050881e-08, "logits/chosen": -2.2364959716796875, "logits/rejected": -2.1941208839416504, "logps/chosen": -40.63759231567383, "logps/rejected": -11.595552444458008, "loss": 0.2911, "rewards/accuracies": 1.0, "rewards/chosen": 1.8460148572921753, "rewards/margins": 1.0849950313568115, "rewards/rejected": 0.7610198855400085, "step": 1812 }, { "epoch": 0.98, "learning_rate": 8.842049687187292e-08, "logits/chosen": -2.1585702896118164, "logits/rejected": -2.2978947162628174, "logps/chosen": -2.4135141372680664, "logps/rejected": -1.71756112575531, "loss": 0.7672, "rewards/accuracies": 0.0, "rewards/chosen": 0.8379685282707214, "rewards/margins": -0.1430233120918274, "rewards/rejected": 0.9809918403625488, "step": 1813 }, { "epoch": 0.98, "learning_rate": 8.840651790427292e-08, "logits/chosen": -2.0304532051086426, "logits/rejected": -2.2623445987701416, "logps/chosen": -0.8442473411560059, "logps/rejected": -0.8882438540458679, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 0.9903751611709595, "rewards/margins": 0.03193771839141846, "rewards/rejected": 0.958437442779541, "step": 1814 }, { "epoch": 0.98, "learning_rate": 8.839253161037539e-08, "logits/chosen": -2.0193634033203125, "logits/rejected": -2.314847707748413, "logps/chosen": -0.9174942970275879, "logps/rejected": -5.32022762298584, "loss": 0.6283, "rewards/accuracies": 1.0, "rewards/chosen": 1.0313173532485962, "rewards/margins": 0.13416451215744019, "rewards/rejected": 0.897152841091156, "step": 1815 }, { "epoch": 0.98, "learning_rate": 8.837853799284831e-08, "logits/chosen": -2.042422294616699, "logits/rejected": -2.2998695373535156, "logps/chosen": -0.19068333506584167, "logps/rejected": -0.1943439543247223, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.8576241731643677, "rewards/margins": 0.023426711559295654, "rewards/rejected": 0.834197461605072, "step": 1816 }, { "epoch": 0.98, "learning_rate": 8.836453705436105e-08, "logits/chosen": -2.116856098175049, "logits/rejected": -2.275388240814209, "logps/chosen": -0.9997924566268921, "logps/rejected": -1.0977305173873901, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.9874266982078552, "rewards/margins": -0.0006852149963378906, "rewards/rejected": 0.9881119132041931, "step": 1817 }, { "epoch": 0.98, "learning_rate": 8.83505287975844e-08, "logits/chosen": -2.221468210220337, "logits/rejected": -2.1411542892456055, "logps/chosen": -24.635496139526367, "logps/rejected": -3.6034414768218994, "loss": 0.3409, "rewards/accuracies": 1.0, "rewards/chosen": 1.5918505191802979, "rewards/margins": 0.9007889032363892, "rewards/rejected": 0.6910616159439087, "step": 1818 }, { "epoch": 0.98, "learning_rate": 8.83365132251905e-08, "logits/chosen": -2.1463358402252197, "logits/rejected": -2.321566343307495, "logps/chosen": -0.384469211101532, "logps/rejected": -0.4017637372016907, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9358764886856079, "rewards/margins": 0.014269649982452393, "rewards/rejected": 0.9216068387031555, "step": 1819 }, { "epoch": 0.98, "learning_rate": 8.832249033985293e-08, "logits/chosen": -1.9514392614364624, "logits/rejected": -1.9500257968902588, "logps/chosen": -7.867130756378174, "logps/rejected": -3.26870059967041, "loss": 0.3767, "rewards/accuracies": 1.0, "rewards/chosen": 1.3909212350845337, "rewards/margins": 0.7820152640342712, "rewards/rejected": 0.6089059710502625, "step": 1820 }, { "epoch": 0.98, "learning_rate": 8.830846014424664e-08, "logits/chosen": -1.9819859266281128, "logits/rejected": -2.2172398567199707, "logps/chosen": -0.6377952098846436, "logps/rejected": -0.7768949866294861, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.8589059114456177, "rewards/margins": 0.018858134746551514, "rewards/rejected": 0.8400477766990662, "step": 1821 }, { "epoch": 0.98, "learning_rate": 8.8294422641048e-08, "logits/chosen": -2.173546075820923, "logits/rejected": -2.33782958984375, "logps/chosen": -18.726613998413086, "logps/rejected": -4.893085479736328, "loss": 0.7828, "rewards/accuracies": 0.0, "rewards/chosen": 0.6511415839195251, "rewards/margins": -0.17187821865081787, "rewards/rejected": 0.823019802570343, "step": 1822 }, { "epoch": 0.98, "learning_rate": 8.828037783293474e-08, "logits/chosen": -2.109377861022949, "logits/rejected": -2.120532989501953, "logps/chosen": -7.433019161224365, "logps/rejected": -2.3840034008026123, "loss": 0.5322, "rewards/accuracies": 1.0, "rewards/chosen": 1.0774221420288086, "rewards/margins": 0.3527523875236511, "rewards/rejected": 0.7246697545051575, "step": 1823 }, { "epoch": 0.98, "learning_rate": 8.826632572258601e-08, "logits/chosen": -2.103614568710327, "logits/rejected": -2.233645439147949, "logps/chosen": -0.6185182929039001, "logps/rejected": -0.6147915124893188, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.8222723007202148, "rewards/margins": 0.03914123773574829, "rewards/rejected": 0.7831310629844666, "step": 1824 }, { "epoch": 0.98, "learning_rate": 8.825226631268232e-08, "logits/chosen": -2.125365734100342, "logits/rejected": -2.1265389919281006, "logps/chosen": -0.49667632579803467, "logps/rejected": -4.796531677246094, "loss": 0.4858, "rewards/accuracies": 1.0, "rewards/chosen": 0.9394340515136719, "rewards/margins": 0.46914052963256836, "rewards/rejected": 0.4702935218811035, "step": 1825 }, { "epoch": 0.98, "learning_rate": 8.823819960590562e-08, "logits/chosen": -1.9741677045822144, "logits/rejected": -1.976919174194336, "logps/chosen": -1.1823927164077759, "logps/rejected": -1.9760499000549316, "loss": 0.5143, "rewards/accuracies": 1.0, "rewards/chosen": 1.1387311220169067, "rewards/margins": 0.39672428369522095, "rewards/rejected": 0.7420068383216858, "step": 1826 }, { "epoch": 0.99, "learning_rate": 8.822412560493923e-08, "logits/chosen": -2.028892755508423, "logits/rejected": -2.0216786861419678, "logps/chosen": -3.4540112018585205, "logps/rejected": -2.692420482635498, "loss": 0.628, "rewards/accuracies": 1.0, "rewards/chosen": 0.9117135405540466, "rewards/margins": 0.13487887382507324, "rewards/rejected": 0.7768346667289734, "step": 1827 }, { "epoch": 0.99, "learning_rate": 8.821004431246784e-08, "logits/chosen": -2.1657729148864746, "logits/rejected": -2.171626567840576, "logps/chosen": -4.743514537811279, "logps/rejected": -6.303450584411621, "loss": 0.4068, "rewards/accuracies": 1.0, "rewards/chosen": 1.5661388635635376, "rewards/margins": 0.6892887949943542, "rewards/rejected": 0.8768500685691833, "step": 1828 }, { "epoch": 0.99, "learning_rate": 8.819595573117757e-08, "logits/chosen": -2.048586130142212, "logits/rejected": -2.2508695125579834, "logps/chosen": -1.1753172874450684, "logps/rejected": -1.3422636985778809, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 0.9230745434761047, "rewards/margins": 0.03195875883102417, "rewards/rejected": 0.8911157846450806, "step": 1829 }, { "epoch": 0.99, "learning_rate": 8.818185986375589e-08, "logits/chosen": -1.9118326902389526, "logits/rejected": -2.195411443710327, "logps/chosen": -0.4852985739707947, "logps/rejected": -0.4914266765117645, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.7847592234611511, "rewards/margins": 0.02107638120651245, "rewards/rejected": 0.7636828422546387, "step": 1830 }, { "epoch": 0.99, "learning_rate": 8.816775671289169e-08, "logits/chosen": -2.1079235076904297, "logits/rejected": -2.043205976486206, "logps/chosen": -12.846097946166992, "logps/rejected": -7.596085548400879, "loss": 0.8233, "rewards/accuracies": 0.0, "rewards/chosen": 0.4256574809551239, "rewards/margins": -0.24532440304756165, "rewards/rejected": 0.6709818840026855, "step": 1831 }, { "epoch": 0.99, "learning_rate": 8.815364628127525e-08, "logits/chosen": -2.1123509407043457, "logits/rejected": -2.1162331104278564, "logps/chosen": -2.603275775909424, "logps/rejected": -4.185113430023193, "loss": 0.4752, "rewards/accuracies": 1.0, "rewards/chosen": 1.009635329246521, "rewards/margins": 0.49715471267700195, "rewards/rejected": 0.512480616569519, "step": 1832 }, { "epoch": 0.99, "learning_rate": 8.813952857159819e-08, "logits/chosen": -2.147967576980591, "logits/rejected": -2.138871669769287, "logps/chosen": -1.4705690145492554, "logps/rejected": -5.703281879425049, "loss": 0.4652, "rewards/accuracies": 1.0, "rewards/chosen": 1.0410014390945435, "rewards/margins": 0.5236132144927979, "rewards/rejected": 0.5173882246017456, "step": 1833 }, { "epoch": 0.99, "learning_rate": 8.812540358655361e-08, "logits/chosen": -2.0741260051727295, "logits/rejected": -2.2937419414520264, "logps/chosen": -0.189576655626297, "logps/rejected": -0.19402818381786346, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9273895621299744, "rewards/margins": 0.019131481647491455, "rewards/rejected": 0.9082580804824829, "step": 1834 }, { "epoch": 0.99, "learning_rate": 8.811127132883589e-08, "logits/chosen": -2.0550668239593506, "logits/rejected": -2.0622832775115967, "logps/chosen": -0.7888556718826294, "logps/rejected": -7.059007167816162, "loss": 0.4154, "rewards/accuracies": 1.0, "rewards/chosen": 1.1436079740524292, "rewards/margins": 0.6636353731155396, "rewards/rejected": 0.47997260093688965, "step": 1835 }, { "epoch": 0.99, "learning_rate": 8.809713180114091e-08, "logits/chosen": -2.061699390411377, "logits/rejected": -2.3015925884246826, "logps/chosen": -0.6194781064987183, "logps/rejected": -0.5780225992202759, "loss": 0.6743, "rewards/accuracies": 1.0, "rewards/chosen": 0.8937854766845703, "rewards/margins": 0.03804069757461548, "rewards/rejected": 0.8557447791099548, "step": 1836 }, { "epoch": 0.99, "learning_rate": 8.808298500616582e-08, "logits/chosen": -2.1617136001586914, "logits/rejected": -1.9974499940872192, "logps/chosen": -41.57670974731445, "logps/rejected": -4.565208435058594, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/chosen": 1.6430110931396484, "rewards/margins": 0.9266257882118225, "rewards/rejected": 0.7163853049278259, "step": 1837 }, { "epoch": 0.99, "learning_rate": 8.806883094660927e-08, "logits/chosen": -2.002199411392212, "logits/rejected": -2.3212761878967285, "logps/chosen": -2.2871978282928467, "logps/rejected": -0.8478348255157471, "loss": 0.7114, "rewards/accuracies": 0.0, "rewards/chosen": 0.9994015097618103, "rewards/margins": -0.03611558675765991, "rewards/rejected": 1.0355170965194702, "step": 1838 }, { "epoch": 0.99, "learning_rate": 8.80546696251712e-08, "logits/chosen": -2.0295159816741943, "logits/rejected": -2.00972318649292, "logps/chosen": -6.988550186157227, "logps/rejected": -5.780521392822266, "loss": 0.3731, "rewards/accuracies": 1.0, "rewards/chosen": 1.3357242345809937, "rewards/margins": 0.7935933470726013, "rewards/rejected": 0.5421308875083923, "step": 1839 }, { "epoch": 0.99, "learning_rate": 8.804050104455298e-08, "logits/chosen": -2.18361759185791, "logits/rejected": -2.1953818798065186, "logps/chosen": -2.289503574371338, "logps/rejected": -2.489565849304199, "loss": 0.5013, "rewards/accuracies": 1.0, "rewards/chosen": 1.271366000175476, "rewards/margins": 0.42957043647766113, "rewards/rejected": 0.8417955636978149, "step": 1840 }, { "epoch": 0.99, "learning_rate": 8.802632520745739e-08, "logits/chosen": -2.230891466140747, "logits/rejected": -2.250906229019165, "logps/chosen": -17.502267837524414, "logps/rejected": -18.70749282836914, "loss": 0.55, "rewards/accuracies": 1.0, "rewards/chosen": 1.314312219619751, "rewards/margins": 0.3101789951324463, "rewards/rejected": 1.0041332244873047, "step": 1841 }, { "epoch": 0.99, "learning_rate": 8.801214211658852e-08, "logits/chosen": -2.189685583114624, "logits/rejected": -2.1891379356384277, "logps/chosen": -1.9589898586273193, "logps/rejected": -5.011348724365234, "loss": 0.427, "rewards/accuracies": 1.0, "rewards/chosen": 1.0762208700180054, "rewards/margins": 0.6299426555633545, "rewards/rejected": 0.4462781846523285, "step": 1842 }, { "epoch": 0.99, "learning_rate": 8.799795177465193e-08, "logits/chosen": -2.0717666149139404, "logits/rejected": -2.061627149581909, "logps/chosen": -1.4010041952133179, "logps/rejected": -10.550393104553223, "loss": 0.5754, "rewards/accuracies": 1.0, "rewards/chosen": 0.9843711256980896, "rewards/margins": 0.2511524558067322, "rewards/rejected": 0.7332186698913574, "step": 1843 }, { "epoch": 0.99, "learning_rate": 8.798375418435451e-08, "logits/chosen": -2.0651614665985107, "logits/rejected": -2.25748610496521, "logps/chosen": -0.32739895582199097, "logps/rejected": -0.32558420300483704, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.9073411822319031, "rewards/margins": -0.0010485649108886719, "rewards/rejected": 0.9083897471427917, "step": 1844 }, { "epoch": 1.0, "learning_rate": 8.796954934840452e-08, "logits/chosen": -2.083282232284546, "logits/rejected": -2.2303645610809326, "logps/chosen": -1.2509839534759521, "logps/rejected": -1.1571427583694458, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.7347966432571411, "rewards/margins": -0.002824842929840088, "rewards/rejected": 0.7376214861869812, "step": 1845 }, { "epoch": 1.0, "learning_rate": 8.795533726951166e-08, "logits/chosen": -1.968178629875183, "logits/rejected": -1.9680763483047485, "logps/chosen": -1.8653841018676758, "logps/rejected": -0.9907634258270264, "loss": 0.5838, "rewards/accuracies": 1.0, "rewards/chosen": 1.1085097789764404, "rewards/margins": 0.23214125633239746, "rewards/rejected": 0.876368522644043, "step": 1846 }, { "epoch": 1.0, "learning_rate": 8.794111795038696e-08, "logits/chosen": -1.9868035316467285, "logits/rejected": -1.978500485420227, "logps/chosen": -5.7607574462890625, "logps/rejected": -4.8993000984191895, "loss": 0.3138, "rewards/accuracies": 1.0, "rewards/chosen": 1.5047181844711304, "rewards/margins": 0.9978415369987488, "rewards/rejected": 0.5068766474723816, "step": 1847 }, { "epoch": 1.0, "learning_rate": 8.792689139374285e-08, "logits/chosen": -1.9756746292114258, "logits/rejected": -2.23653244972229, "logps/chosen": -1.0140613317489624, "logps/rejected": -1.0619003772735596, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.85296231508255, "rewards/margins": 0.015474021434783936, "rewards/rejected": 0.8374882936477661, "step": 1848 }, { "epoch": 1.0, "learning_rate": 8.791265760229314e-08, "logits/chosen": -2.132002592086792, "logits/rejected": -2.1733741760253906, "logps/chosen": -5.265029430389404, "logps/rejected": -11.121811866760254, "loss": 0.3496, "rewards/accuracies": 1.0, "rewards/chosen": 1.49774968624115, "rewards/margins": 0.8709110617637634, "rewards/rejected": 0.6268386244773865, "step": 1849 }, { "epoch": 1.0, "learning_rate": 8.789841657875303e-08, "logits/chosen": -2.0801594257354736, "logits/rejected": -2.2767486572265625, "logps/chosen": -0.6034179925918579, "logps/rejected": -0.6684168577194214, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9198317527770996, "rewards/margins": 0.01769047975540161, "rewards/rejected": 0.902141273021698, "step": 1850 }, { "epoch": 1.0, "learning_rate": 8.788416832583908e-08, "logits/chosen": -2.013981342315674, "logits/rejected": -2.2602438926696777, "logps/chosen": -1.058358907699585, "logps/rejected": -1.039596438407898, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0242246389389038, "rewards/margins": 0.011049270629882812, "rewards/rejected": 1.013175368309021, "step": 1851 }, { "epoch": 1.0, "learning_rate": 8.786991284626923e-08, "logits/chosen": -2.025301933288574, "logits/rejected": -2.026672601699829, "logps/chosen": -1.6627635955810547, "logps/rejected": -2.044372081756592, "loss": 0.513, "rewards/accuracies": 1.0, "rewards/chosen": 1.0607126951217651, "rewards/margins": 0.40010786056518555, "rewards/rejected": 0.6606048345565796, "step": 1852 }, { "epoch": 1.0, "learning_rate": 8.785565014276284e-08, "logits/chosen": -2.342324733734131, "logits/rejected": -2.289682626724243, "logps/chosen": -25.208229064941406, "logps/rejected": -4.6700286865234375, "loss": 0.2232, "rewards/accuracies": 1.0, "rewards/chosen": 1.8970688581466675, "rewards/margins": 1.3860418796539307, "rewards/rejected": 0.5110270380973816, "step": 1853 }, { "epoch": 1.0, "learning_rate": 8.784138021804058e-08, "logits/chosen": -1.9935743808746338, "logits/rejected": -2.2245936393737793, "logps/chosen": -0.6579334735870361, "logps/rejected": -0.6000222563743591, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.8920254707336426, "rewards/margins": -0.011392772197723389, "rewards/rejected": 0.903418242931366, "step": 1854 }, { "epoch": 1.0, "learning_rate": 8.782710307482453e-08, "logits/chosen": -2.046734094619751, "logits/rejected": -2.0455870628356934, "logps/chosen": -1.1675268411636353, "logps/rejected": -1.7244935035705566, "loss": 0.6246, "rewards/accuracies": 1.0, "rewards/chosen": 0.9740964770317078, "rewards/margins": 0.1421237587928772, "rewards/rejected": 0.8319727182388306, "step": 1855 }, { "epoch": 1.0, "learning_rate": 8.781281871583819e-08, "logits/chosen": -1.9845751523971558, "logits/rejected": -1.9833221435546875, "logps/chosen": -1.9101440906524658, "logps/rejected": -3.8981118202209473, "loss": 0.494, "rewards/accuracies": 1.0, "rewards/chosen": 0.9401853680610657, "rewards/margins": 0.4481789171695709, "rewards/rejected": 0.49200645089149475, "step": 1856 }, { "epoch": 1.0, "learning_rate": 8.779852714380635e-08, "logits/chosen": -2.0692756175994873, "logits/rejected": -2.0684444904327393, "logps/chosen": -4.022620677947998, "logps/rejected": -10.973926544189453, "loss": 0.4242, "rewards/accuracies": 1.0, "rewards/chosen": 0.9987918734550476, "rewards/margins": 0.6378456354141235, "rewards/rejected": 0.36094626784324646, "step": 1857 }, { "epoch": 1.0, "learning_rate": 8.778422836145528e-08, "logits/chosen": -2.1634743213653564, "logits/rejected": -2.1636953353881836, "logps/chosen": -1.156124234199524, "logps/rejected": -4.9331536293029785, "loss": 0.4988, "rewards/accuracies": 1.0, "rewards/chosen": 1.0164954662322998, "rewards/margins": 0.43586188554763794, "rewards/rejected": 0.5806335806846619, "step": 1858 }, { "epoch": 1.0, "learning_rate": 8.776992237151248e-08, "logits/chosen": -2.0201151371002197, "logits/rejected": -2.255782127380371, "logps/chosen": -0.40902018547058105, "logps/rejected": -0.4447272717952728, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.8751490712165833, "rewards/margins": 0.011394679546356201, "rewards/rejected": 0.863754391670227, "step": 1859 }, { "epoch": 1.0, "learning_rate": 8.775560917670699e-08, "logits/chosen": -2.3144736289978027, "logits/rejected": -2.0760016441345215, "logps/chosen": -50.589317321777344, "logps/rejected": -5.499085903167725, "loss": 0.2431, "rewards/accuracies": 1.0, "rewards/chosen": 1.799153208732605, "rewards/margins": 1.290095567703247, "rewards/rejected": 0.5090577006340027, "step": 1860 }, { "epoch": 1.0, "learning_rate": 8.774128877976911e-08, "logits/chosen": -2.0785462856292725, "logits/rejected": -2.316326141357422, "logps/chosen": -3.7813289165496826, "logps/rejected": -13.831212997436523, "loss": 0.7077, "rewards/accuracies": 0.0, "rewards/chosen": 0.9526692628860474, "rewards/margins": -0.028941035270690918, "rewards/rejected": 0.9816102981567383, "step": 1861 }, { "epoch": 1.0, "learning_rate": 8.772696118343059e-08, "logits/chosen": -2.1457457542419434, "logits/rejected": -2.3224709033966064, "logps/chosen": -2.9055588245391846, "logps/rejected": -2.8950698375701904, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.8130916953086853, "rewards/margins": 0.014878511428833008, "rewards/rejected": 0.7982131838798523, "step": 1862 }, { "epoch": 1.0, "learning_rate": 8.771262639042445e-08, "logits/chosen": -2.061981439590454, "logits/rejected": -2.241166830062866, "logps/chosen": -3.215097665786743, "logps/rejected": -6.168379783630371, "loss": 0.6548, "rewards/accuracies": 1.0, "rewards/chosen": 0.5668482184410095, "rewards/margins": 0.07824775576591492, "rewards/rejected": 0.4886004626750946, "step": 1863 }, { "epoch": 1.01, "learning_rate": 8.769828440348518e-08, "logits/chosen": -2.164703845977783, "logits/rejected": -2.170011520385742, "logps/chosen": -0.8832436203956604, "logps/rejected": -5.332999229431152, "loss": 0.4345, "rewards/accuracies": 1.0, "rewards/chosen": 0.9940893054008484, "rewards/margins": 0.6083778142929077, "rewards/rejected": 0.3857114911079407, "step": 1864 }, { "epoch": 1.01, "learning_rate": 8.768393522534862e-08, "logits/chosen": -2.014265537261963, "logits/rejected": -2.239380121231079, "logps/chosen": -0.5597437024116516, "logps/rejected": -0.6545502543449402, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": 0.9396705031394958, "rewards/margins": -0.008951067924499512, "rewards/rejected": 0.9486215710639954, "step": 1865 }, { "epoch": 1.01, "learning_rate": 8.766957885875197e-08, "logits/chosen": -2.088573455810547, "logits/rejected": -2.3464410305023193, "logps/chosen": -2.0101654529571533, "logps/rejected": -1.7176389694213867, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 1.0037615299224854, "rewards/margins": -0.018056750297546387, "rewards/rejected": 1.0218182802200317, "step": 1866 }, { "epoch": 1.01, "learning_rate": 8.765521530643378e-08, "logits/chosen": -2.0408380031585693, "logits/rejected": -2.0374109745025635, "logps/chosen": -7.6476335525512695, "logps/rejected": -7.698282241821289, "loss": 0.4252, "rewards/accuracies": 1.0, "rewards/chosen": 1.1299201250076294, "rewards/margins": 0.6349670886993408, "rewards/rejected": 0.49495306611061096, "step": 1867 }, { "epoch": 1.01, "learning_rate": 8.764084457113401e-08, "logits/chosen": -2.0342907905578613, "logits/rejected": -2.2483513355255127, "logps/chosen": -0.5937633514404297, "logps/rejected": -0.5387352705001831, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.8525437712669373, "rewards/margins": 0.015158772468566895, "rewards/rejected": 0.8373849987983704, "step": 1868 }, { "epoch": 1.01, "learning_rate": 8.762646665559396e-08, "logits/chosen": -2.107675552368164, "logits/rejected": -2.317826986312866, "logps/chosen": -0.7441526651382446, "logps/rejected": -0.7350124716758728, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.8659858703613281, "rewards/margins": -0.003213346004486084, "rewards/rejected": 0.8691992163658142, "step": 1869 }, { "epoch": 1.01, "learning_rate": 8.761208156255633e-08, "logits/chosen": -2.0566723346710205, "logits/rejected": -2.221489191055298, "logps/chosen": -1.0290882587432861, "logps/rejected": -0.9742677807807922, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.8674063682556152, "rewards/margins": -0.0035046935081481934, "rewards/rejected": 0.8709110617637634, "step": 1870 }, { "epoch": 1.01, "learning_rate": 8.759768929476516e-08, "logits/chosen": -2.049203395843506, "logits/rejected": -2.2529571056365967, "logps/chosen": -8.794023513793945, "logps/rejected": -2.8169853687286377, "loss": 0.7501, "rewards/accuracies": 0.0, "rewards/chosen": 0.8697357177734375, "rewards/margins": -0.11080902814865112, "rewards/rejected": 0.9805447459220886, "step": 1871 }, { "epoch": 1.01, "learning_rate": 8.758328985496587e-08, "logits/chosen": -2.0303547382354736, "logits/rejected": -2.249682903289795, "logps/chosen": -0.6738154888153076, "logps/rejected": -0.6500688195228577, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.8235823512077332, "rewards/margins": 0.016301214694976807, "rewards/rejected": 0.8072811365127563, "step": 1872 }, { "epoch": 1.01, "learning_rate": 8.756888324590525e-08, "logits/chosen": -2.118410587310791, "logits/rejected": -2.3005332946777344, "logps/chosen": -2.813601493835449, "logps/rejected": -2.854163408279419, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.6218889355659485, "rewards/margins": 0.027283012866973877, "rewards/rejected": 0.5946059226989746, "step": 1873 }, { "epoch": 1.01, "learning_rate": 8.755446947033145e-08, "logits/chosen": -2.145261287689209, "logits/rejected": -2.277740240097046, "logps/chosen": -0.7710721492767334, "logps/rejected": -0.8081674575805664, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.9118730425834656, "rewards/margins": 0.01798349618911743, "rewards/rejected": 0.8938895463943481, "step": 1874 }, { "epoch": 1.01, "learning_rate": 8.754004853099401e-08, "logits/chosen": -2.249337673187256, "logits/rejected": -2.1946942806243896, "logps/chosen": -13.327838897705078, "logps/rejected": -10.177143096923828, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": 1.8098796606063843, "rewards/margins": 1.3456950187683105, "rewards/rejected": 0.46418458223342896, "step": 1875 }, { "epoch": 1.01, "learning_rate": 8.75256204306438e-08, "logits/chosen": -2.248044967651367, "logits/rejected": -2.2876129150390625, "logps/chosen": -1.226967692375183, "logps/rejected": -1.246956706047058, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.8814568519592285, "rewards/margins": 0.01944500207901001, "rewards/rejected": 0.8620118498802185, "step": 1876 }, { "epoch": 1.01, "learning_rate": 8.75111851720331e-08, "logits/chosen": -2.286926746368408, "logits/rejected": -2.2426564693450928, "logps/chosen": -7.95941686630249, "logps/rejected": -7.814596176147461, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.4342927634716034, "rewards/margins": 0.009138554334640503, "rewards/rejected": 0.4251542091369629, "step": 1877 }, { "epoch": 1.01, "learning_rate": 8.74967427579155e-08, "logits/chosen": -2.123002290725708, "logits/rejected": -2.005798101425171, "logps/chosen": -25.933876037597656, "logps/rejected": -2.454676389694214, "loss": 0.4023, "rewards/accuracies": 1.0, "rewards/chosen": 1.3538891077041626, "rewards/margins": 0.7027928233146667, "rewards/rejected": 0.6510962843894958, "step": 1878 }, { "epoch": 1.01, "learning_rate": 8.748229319104601e-08, "logits/chosen": -2.0060455799102783, "logits/rejected": -2.2491843700408936, "logps/chosen": -0.6461340188980103, "logps/rejected": -0.6470837593078613, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9981607794761658, "rewards/margins": 0.01626187562942505, "rewards/rejected": 0.9818989038467407, "step": 1879 }, { "epoch": 1.01, "learning_rate": 8.746783647418097e-08, "logits/chosen": -2.0049009323120117, "logits/rejected": -1.9860179424285889, "logps/chosen": -9.794931411743164, "logps/rejected": -10.505280494689941, "loss": 0.5085, "rewards/accuracies": 1.0, "rewards/chosen": 1.21575129032135, "rewards/margins": 0.41132766008377075, "rewards/rejected": 0.8044236302375793, "step": 1880 }, { "epoch": 1.01, "learning_rate": 8.74533726100781e-08, "logits/chosen": -2.06146240234375, "logits/rejected": -2.281623601913452, "logps/chosen": -1.2090080976486206, "logps/rejected": -1.3262238502502441, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 0.8880574107170105, "rewards/margins": 0.035845816135406494, "rewards/rejected": 0.852211594581604, "step": 1881 }, { "epoch": 1.02, "learning_rate": 8.743890160149648e-08, "logits/chosen": -1.9940396547317505, "logits/rejected": -1.9985342025756836, "logps/chosen": -1.6790971755981445, "logps/rejected": -4.727569103240967, "loss": 0.4251, "rewards/accuracies": 1.0, "rewards/chosen": 1.0444687604904175, "rewards/margins": 0.6354468464851379, "rewards/rejected": 0.40902191400527954, "step": 1882 }, { "epoch": 1.02, "learning_rate": 8.742442345119653e-08, "logits/chosen": -2.0688588619232178, "logits/rejected": -2.2547457218170166, "logps/chosen": -0.9349108934402466, "logps/rejected": -1.0935125350952148, "loss": 0.7263, "rewards/accuracies": 0.0, "rewards/chosen": 0.8204480409622192, "rewards/margins": -0.06532388925552368, "rewards/rejected": 0.8857719302177429, "step": 1883 }, { "epoch": 1.02, "learning_rate": 8.740993816194009e-08, "logits/chosen": -2.1015281677246094, "logits/rejected": -2.1011710166931152, "logps/chosen": -4.6447649002075195, "logps/rejected": -2.244805097579956, "loss": 0.2633, "rewards/accuracies": 1.0, "rewards/chosen": 1.798106074333191, "rewards/margins": 1.1998546123504639, "rewards/rejected": 0.598251461982727, "step": 1884 }, { "epoch": 1.02, "learning_rate": 8.739544573649029e-08, "logits/chosen": -2.134650230407715, "logits/rejected": -2.0311644077301025, "logps/chosen": -27.412195205688477, "logps/rejected": -3.0297114849090576, "loss": 0.2432, "rewards/accuracies": 1.0, "rewards/chosen": 1.9106791019439697, "rewards/margins": 1.2899935245513916, "rewards/rejected": 0.6206855773925781, "step": 1885 }, { "epoch": 1.02, "learning_rate": 8.738094617761167e-08, "logits/chosen": -1.9816681146621704, "logits/rejected": -2.276352882385254, "logps/chosen": -0.47342559695243835, "logps/rejected": -0.4804306626319885, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9818798899650574, "rewards/margins": 0.020437300205230713, "rewards/rejected": 0.9614425897598267, "step": 1886 }, { "epoch": 1.02, "learning_rate": 8.736643948807012e-08, "logits/chosen": -2.080270528793335, "logits/rejected": -2.299579381942749, "logps/chosen": -12.71896743774414, "logps/rejected": -6.850667953491211, "loss": 0.8173, "rewards/accuracies": 0.0, "rewards/chosen": 0.651471734046936, "rewards/margins": -0.23458898067474365, "rewards/rejected": 0.8860607147216797, "step": 1887 }, { "epoch": 1.02, "learning_rate": 8.73519256706329e-08, "logits/chosen": -2.0801262855529785, "logits/rejected": -2.082315683364868, "logps/chosen": -2.908430576324463, "logps/rejected": -6.323200702667236, "loss": 0.3871, "rewards/accuracies": 1.0, "rewards/chosen": 1.410893440246582, "rewards/margins": 0.7492839097976685, "rewards/rejected": 0.6616095304489136, "step": 1888 }, { "epoch": 1.02, "learning_rate": 8.733740472806858e-08, "logits/chosen": -2.1287357807159424, "logits/rejected": -2.1734225749969482, "logps/chosen": -5.517607688903809, "logps/rejected": -11.047445297241211, "loss": 0.3594, "rewards/accuracies": 1.0, "rewards/chosen": 1.472491979598999, "rewards/margins": 0.8382167220115662, "rewards/rejected": 0.6342752575874329, "step": 1889 }, { "epoch": 1.02, "learning_rate": 8.732287666314714e-08, "logits/chosen": -2.1434693336486816, "logits/rejected": -2.2829055786132812, "logps/chosen": -0.47724419832229614, "logps/rejected": -0.44852542877197266, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.9155446887016296, "rewards/margins": 0.010926127433776855, "rewards/rejected": 0.9046185612678528, "step": 1890 }, { "epoch": 1.02, "learning_rate": 8.730834147863992e-08, "logits/chosen": -2.104196310043335, "logits/rejected": -2.1189815998077393, "logps/chosen": -7.951521873474121, "logps/rejected": -4.441303253173828, "loss": 0.4372, "rewards/accuracies": 1.0, "rewards/chosen": 1.2703137397766113, "rewards/margins": 0.6007423996925354, "rewards/rejected": 0.6695713400840759, "step": 1891 }, { "epoch": 1.02, "learning_rate": 8.729379917731961e-08, "logits/chosen": -2.1551995277404785, "logits/rejected": -2.2566912174224854, "logps/chosen": -5.23817253112793, "logps/rejected": -2.831239700317383, "loss": 0.7148, "rewards/accuracies": 0.0, "rewards/chosen": 0.7720698714256287, "rewards/margins": -0.0428805947303772, "rewards/rejected": 0.8149504661560059, "step": 1892 }, { "epoch": 1.02, "learning_rate": 8.727924976196021e-08, "logits/chosen": -2.1181814670562744, "logits/rejected": -2.29573655128479, "logps/chosen": -1.9241026639938354, "logps/rejected": -1.956650733947754, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.9656352996826172, "rewards/margins": 0.024929344654083252, "rewards/rejected": 0.9407059550285339, "step": 1893 }, { "epoch": 1.02, "learning_rate": 8.726469323533714e-08, "logits/chosen": -2.028766632080078, "logits/rejected": -2.2990715503692627, "logps/chosen": -0.40184056758880615, "logps/rejected": -0.41927725076675415, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.8924989104270935, "rewards/margins": 0.01441413164138794, "rewards/rejected": 0.8780847787857056, "step": 1894 }, { "epoch": 1.02, "learning_rate": 8.725012960022717e-08, "logits/chosen": -2.042597532272339, "logits/rejected": -2.2322707176208496, "logps/chosen": -4.293673992156982, "logps/rejected": -4.581733703613281, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.6497852802276611, "rewards/margins": 0.01875537633895874, "rewards/rejected": 0.6310299038887024, "step": 1895 }, { "epoch": 1.02, "learning_rate": 8.723555885940839e-08, "logits/chosen": -2.094472646713257, "logits/rejected": -2.032897472381592, "logps/chosen": -13.902554512023926, "logps/rejected": -20.129745483398438, "loss": 0.278, "rewards/accuracies": 1.0, "rewards/chosen": 1.5773314237594604, "rewards/margins": 1.1378344297409058, "rewards/rejected": 0.4394969940185547, "step": 1896 }, { "epoch": 1.02, "learning_rate": 8.722098101566027e-08, "logits/chosen": -2.0150206089019775, "logits/rejected": -2.0222370624542236, "logps/chosen": -2.2339186668395996, "logps/rejected": -4.173457145690918, "loss": 0.4621, "rewards/accuracies": 1.0, "rewards/chosen": 1.0858691930770874, "rewards/margins": 0.531939685344696, "rewards/rejected": 0.5539295077323914, "step": 1897 }, { "epoch": 1.02, "learning_rate": 8.720639607176364e-08, "logits/chosen": -2.1711843013763428, "logits/rejected": -2.0628325939178467, "logps/chosen": -42.42844772338867, "logps/rejected": -2.052694797515869, "loss": 0.2685, "rewards/accuracies": 1.0, "rewards/chosen": 1.940555214881897, "rewards/margins": 1.1778464317321777, "rewards/rejected": 0.762708842754364, "step": 1898 }, { "epoch": 1.02, "learning_rate": 8.719180403050063e-08, "logits/chosen": -2.195723533630371, "logits/rejected": -2.197866916656494, "logps/chosen": -0.4358935058116913, "logps/rejected": -3.160489797592163, "loss": 0.4927, "rewards/accuracies": 1.0, "rewards/chosen": 1.0287703275680542, "rewards/margins": 0.45132631063461304, "rewards/rejected": 0.5774440169334412, "step": 1899 }, { "epoch": 1.02, "learning_rate": 8.717720489465483e-08, "logits/chosen": -2.052781343460083, "logits/rejected": -2.0532941818237305, "logps/chosen": -4.801971435546875, "logps/rejected": -2.1962015628814697, "loss": 0.2914, "rewards/accuracies": 1.0, "rewards/chosen": 1.689103126525879, "rewards/margins": 1.0840152502059937, "rewards/rejected": 0.6050878763198853, "step": 1900 }, { "epoch": 1.03, "learning_rate": 8.71625986670111e-08, "logits/chosen": -2.1798040866851807, "logits/rejected": -2.209204912185669, "logps/chosen": -9.498851776123047, "logps/rejected": -10.496904373168945, "loss": 0.6393, "rewards/accuracies": 1.0, "rewards/chosen": 1.1179192066192627, "rewards/margins": 0.11065888404846191, "rewards/rejected": 1.0072603225708008, "step": 1901 }, { "epoch": 1.03, "learning_rate": 8.714798535035566e-08, "logits/chosen": -2.080443859100342, "logits/rejected": -2.3003411293029785, "logps/chosen": -0.7025197744369507, "logps/rejected": -0.6905646324157715, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.9863874316215515, "rewards/margins": 0.00131988525390625, "rewards/rejected": 0.9850675463676453, "step": 1902 }, { "epoch": 1.03, "learning_rate": 8.713336494747611e-08, "logits/chosen": -1.9927396774291992, "logits/rejected": -2.265742301940918, "logps/chosen": -1.3506300449371338, "logps/rejected": -1.2653204202651978, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.9865052103996277, "rewards/margins": 0.0026094913482666016, "rewards/rejected": 0.9838957190513611, "step": 1903 }, { "epoch": 1.03, "learning_rate": 8.711873746116138e-08, "logits/chosen": -2.114393472671509, "logits/rejected": -2.115736722946167, "logps/chosen": -1.3151936531066895, "logps/rejected": -3.243394613265991, "loss": 0.515, "rewards/accuracies": 1.0, "rewards/chosen": 1.0299221277236938, "rewards/margins": 0.3951719403266907, "rewards/rejected": 0.6347501873970032, "step": 1904 }, { "epoch": 1.03, "learning_rate": 8.710410289420177e-08, "logits/chosen": -2.1092588901519775, "logits/rejected": -2.0055437088012695, "logps/chosen": -19.369949340820312, "logps/rejected": -4.693444728851318, "loss": 0.3836, "rewards/accuracies": 1.0, "rewards/chosen": 1.4649051427841187, "rewards/margins": 0.760140597820282, "rewards/rejected": 0.7047645449638367, "step": 1905 }, { "epoch": 1.03, "learning_rate": 8.708946124938892e-08, "logits/chosen": -2.0210320949554443, "logits/rejected": -2.0160059928894043, "logps/chosen": -4.530328750610352, "logps/rejected": -2.7783894538879395, "loss": 0.3711, "rewards/accuracies": 1.0, "rewards/chosen": 1.5255005359649658, "rewards/margins": 0.8000566363334656, "rewards/rejected": 0.7254438996315002, "step": 1906 }, { "epoch": 1.03, "learning_rate": 8.70748125295158e-08, "logits/chosen": -2.0062379837036133, "logits/rejected": -2.0487263202667236, "logps/chosen": -1.8570127487182617, "logps/rejected": -15.409159660339355, "loss": 0.6712, "rewards/accuracies": 1.0, "rewards/chosen": 0.6962599158287048, "rewards/margins": 0.044341206550598145, "rewards/rejected": 0.6519187092781067, "step": 1907 }, { "epoch": 1.03, "learning_rate": 8.706015673737677e-08, "logits/chosen": -2.2068605422973633, "logits/rejected": -2.2018558979034424, "logps/chosen": -5.22686767578125, "logps/rejected": -2.4626646041870117, "loss": 0.4271, "rewards/accuracies": 1.0, "rewards/chosen": 1.2364333868026733, "rewards/margins": 0.6294715404510498, "rewards/rejected": 0.6069618463516235, "step": 1908 }, { "epoch": 1.03, "learning_rate": 8.704549387576753e-08, "logits/chosen": -2.0412302017211914, "logits/rejected": -2.25917387008667, "logps/chosen": -1.802769660949707, "logps/rejected": -1.7317476272583008, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 1.0111018419265747, "rewards/margins": 0.030206143856048584, "rewards/rejected": 0.9808956980705261, "step": 1909 }, { "epoch": 1.03, "learning_rate": 8.70308239474851e-08, "logits/chosen": -2.1249427795410156, "logits/rejected": -2.215423822402954, "logps/chosen": -0.5681819915771484, "logps/rejected": -0.5835235714912415, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.8901165127754211, "rewards/margins": 0.012474894523620605, "rewards/rejected": 0.8776416182518005, "step": 1910 }, { "epoch": 1.03, "learning_rate": 8.701614695532786e-08, "logits/chosen": -1.9678230285644531, "logits/rejected": -2.227236747741699, "logps/chosen": -0.9296361207962036, "logps/rejected": -0.8650106191635132, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.8649141192436218, "rewards/margins": 0.0327991247177124, "rewards/rejected": 0.8321149945259094, "step": 1911 }, { "epoch": 1.03, "learning_rate": 8.700146290209556e-08, "logits/chosen": -1.9956517219543457, "logits/rejected": -2.245314836502075, "logps/chosen": -1.4326192140579224, "logps/rejected": -1.582155704498291, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.9593318104743958, "rewards/margins": -0.005081832408905029, "rewards/rejected": 0.9644136428833008, "step": 1912 }, { "epoch": 1.03, "learning_rate": 8.698677179058928e-08, "logits/chosen": -2.0582029819488525, "logits/rejected": -2.0629141330718994, "logps/chosen": -1.4673244953155518, "logps/rejected": -1.6947015523910522, "loss": 0.5085, "rewards/accuracies": 1.0, "rewards/chosen": 1.148188591003418, "rewards/margins": 0.4113422632217407, "rewards/rejected": 0.7368463277816772, "step": 1913 }, { "epoch": 1.03, "learning_rate": 8.697207362361143e-08, "logits/chosen": -2.023880958557129, "logits/rejected": -2.0273895263671875, "logps/chosen": -3.370391368865967, "logps/rejected": -0.6460684537887573, "loss": 0.5435, "rewards/accuracies": 1.0, "rewards/chosen": 1.1866867542266846, "rewards/margins": 0.325744092464447, "rewards/rejected": 0.8609426617622375, "step": 1914 }, { "epoch": 1.03, "learning_rate": 8.695736840396579e-08, "logits/chosen": -2.1019668579101562, "logits/rejected": -2.1045937538146973, "logps/chosen": -1.4289355278015137, "logps/rejected": -1.5254371166229248, "loss": 0.5534, "rewards/accuracies": 1.0, "rewards/chosen": 1.047093391418457, "rewards/margins": 0.30217868089675903, "rewards/rejected": 0.744914710521698, "step": 1915 }, { "epoch": 1.03, "learning_rate": 8.694265613445748e-08, "logits/chosen": -2.0193748474121094, "logits/rejected": -2.029944896697998, "logps/chosen": -8.23140811920166, "logps/rejected": -3.094998836517334, "loss": 0.3821, "rewards/accuracies": 1.0, "rewards/chosen": 1.5424609184265137, "rewards/margins": 0.7650042772293091, "rewards/rejected": 0.7774566411972046, "step": 1916 }, { "epoch": 1.03, "learning_rate": 8.692793681789298e-08, "logits/chosen": -2.1007795333862305, "logits/rejected": -2.306173086166382, "logps/chosen": -0.9127532839775085, "logps/rejected": -0.963611900806427, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.7984940409660339, "rewards/margins": 0.005066633224487305, "rewards/rejected": 0.7934274077415466, "step": 1917 }, { "epoch": 1.03, "learning_rate": 8.691321045708006e-08, "logits/chosen": -1.9740498065948486, "logits/rejected": -2.2482688426971436, "logps/chosen": -0.6519935727119446, "logps/rejected": -0.6092071533203125, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.879684567451477, "rewards/margins": -0.004065871238708496, "rewards/rejected": 0.8837504386901855, "step": 1918 }, { "epoch": 1.04, "learning_rate": 8.689847705482791e-08, "logits/chosen": -2.092207431793213, "logits/rejected": -2.1718409061431885, "logps/chosen": -1.895837426185608, "logps/rejected": -19.324438095092773, "loss": 0.4931, "rewards/accuracies": 1.0, "rewards/chosen": 1.2085871696472168, "rewards/margins": 0.4504094123840332, "rewards/rejected": 0.7581777572631836, "step": 1919 }, { "epoch": 1.04, "learning_rate": 8.688373661394698e-08, "logits/chosen": -2.1179139614105225, "logits/rejected": -2.283506393432617, "logps/chosen": -4.878110885620117, "logps/rejected": -4.739516735076904, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.7185776829719543, "rewards/margins": 0.006692171096801758, "rewards/rejected": 0.7118855118751526, "step": 1920 }, { "epoch": 1.04, "learning_rate": 8.686898913724916e-08, "logits/chosen": -2.1525049209594727, "logits/rejected": -2.1523239612579346, "logps/chosen": -2.918212890625, "logps/rejected": -12.693716049194336, "loss": 0.5708, "rewards/accuracies": 1.0, "rewards/chosen": 0.9379553198814392, "rewards/margins": 0.2618013620376587, "rewards/rejected": 0.6761539578437805, "step": 1921 }, { "epoch": 1.04, "learning_rate": 8.685423462754759e-08, "logits/chosen": -2.0616140365600586, "logits/rejected": -2.0639710426330566, "logps/chosen": -3.419297695159912, "logps/rejected": -2.552539825439453, "loss": 0.6675, "rewards/accuracies": 1.0, "rewards/chosen": 0.9246816635131836, "rewards/margins": 0.05197322368621826, "rewards/rejected": 0.8727084398269653, "step": 1922 }, { "epoch": 1.04, "learning_rate": 8.68394730876568e-08, "logits/chosen": -2.1438841819763184, "logits/rejected": -2.1438910961151123, "logps/chosen": -3.7404754161834717, "logps/rejected": -3.1884889602661133, "loss": 0.4682, "rewards/accuracies": 1.0, "rewards/chosen": 1.3456839323043823, "rewards/margins": 0.5156199336051941, "rewards/rejected": 0.8300639986991882, "step": 1923 }, { "epoch": 1.04, "learning_rate": 8.682470452039266e-08, "logits/chosen": -2.2960290908813477, "logits/rejected": -2.3448617458343506, "logps/chosen": -1.0158636569976807, "logps/rejected": -0.6631462574005127, "loss": 0.7078, "rewards/accuracies": 0.0, "rewards/chosen": 0.7424430847167969, "rewards/margins": -0.029132187366485596, "rewards/rejected": 0.7715752720832825, "step": 1924 }, { "epoch": 1.04, "learning_rate": 8.680992892857236e-08, "logits/chosen": -2.011075973510742, "logits/rejected": -2.0043790340423584, "logps/chosen": -6.27228307723999, "logps/rejected": -2.8361446857452393, "loss": 0.4449, "rewards/accuracies": 1.0, "rewards/chosen": 1.2649418115615845, "rewards/margins": 0.5792658925056458, "rewards/rejected": 0.6856759190559387, "step": 1925 }, { "epoch": 1.04, "learning_rate": 8.679514631501446e-08, "logits/chosen": -2.0884604454040527, "logits/rejected": -2.1072335243225098, "logps/chosen": -5.463382244110107, "logps/rejected": -3.10068941116333, "loss": 0.5271, "rewards/accuracies": 1.0, "rewards/chosen": 1.160056471824646, "rewards/margins": 0.365181565284729, "rewards/rejected": 0.794874906539917, "step": 1926 }, { "epoch": 1.04, "learning_rate": 8.67803566825388e-08, "logits/chosen": -2.0639195442199707, "logits/rejected": -2.2243661880493164, "logps/chosen": -0.8304547667503357, "logps/rejected": -0.8669253587722778, "loss": 0.658, "rewards/accuracies": 1.0, "rewards/chosen": 0.9581303000450134, "rewards/margins": 0.0716504454612732, "rewards/rejected": 0.8864798545837402, "step": 1927 }, { "epoch": 1.04, "learning_rate": 8.676556003396665e-08, "logits/chosen": -2.068309783935547, "logits/rejected": -2.0590291023254395, "logps/chosen": -2.623756170272827, "logps/rejected": -10.087381362915039, "loss": 0.3645, "rewards/accuracies": 1.0, "rewards/chosen": 0.9999741911888123, "rewards/margins": 0.8213177919387817, "rewards/rejected": 0.17865638434886932, "step": 1928 }, { "epoch": 1.04, "learning_rate": 8.675075637212055e-08, "logits/chosen": -1.9644200801849365, "logits/rejected": -1.9722230434417725, "logps/chosen": -4.887139797210693, "logps/rejected": -3.982083559036255, "loss": 0.2666, "rewards/accuracies": 1.0, "rewards/chosen": 1.6959362030029297, "rewards/margins": 1.1856746673583984, "rewards/rejected": 0.5102615356445312, "step": 1929 }, { "epoch": 1.04, "learning_rate": 8.673594569982438e-08, "logits/chosen": -2.0075087547302246, "logits/rejected": -2.251380681991577, "logps/chosen": -0.24313467741012573, "logps/rejected": -0.3002667725086212, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.8579821586608887, "rewards/margins": 0.017495036125183105, "rewards/rejected": 0.8404871225357056, "step": 1930 }, { "epoch": 1.04, "learning_rate": 8.67211280199034e-08, "logits/chosen": -2.144704580307007, "logits/rejected": -2.2264673709869385, "logps/chosen": -1.6831495761871338, "logps/rejected": -1.58433198928833, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.872026264667511, "rewards/margins": 0.015059292316436768, "rewards/rejected": 0.8569669723510742, "step": 1931 }, { "epoch": 1.04, "learning_rate": 8.670630333518414e-08, "logits/chosen": -2.173856258392334, "logits/rejected": -2.2518973350524902, "logps/chosen": -0.6165344715118408, "logps/rejected": -0.75123131275177, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.8483853340148926, "rewards/margins": 0.013161957263946533, "rewards/rejected": 0.835223376750946, "step": 1932 }, { "epoch": 1.04, "learning_rate": 8.669147164849455e-08, "logits/chosen": -2.128945827484131, "logits/rejected": -2.134945869445801, "logps/chosen": -1.2252103090286255, "logps/rejected": -2.116734504699707, "loss": 0.5212, "rewards/accuracies": 1.0, "rewards/chosen": 0.9209272265434265, "rewards/margins": 0.37965691089630127, "rewards/rejected": 0.5412703156471252, "step": 1933 }, { "epoch": 1.04, "learning_rate": 8.667663296266388e-08, "logits/chosen": -2.0252444744110107, "logits/rejected": -2.26179575920105, "logps/chosen": -0.5573475956916809, "logps/rejected": -0.6182291507720947, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.9142794609069824, "rewards/margins": 0.02677518129348755, "rewards/rejected": 0.8875042796134949, "step": 1934 }, { "epoch": 1.04, "learning_rate": 8.666178728052265e-08, "logits/chosen": -1.9486020803451538, "logits/rejected": -2.2722761631011963, "logps/chosen": -1.2304003238677979, "logps/rejected": -1.2338377237319946, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.926494300365448, "rewards/margins": 0.004363536834716797, "rewards/rejected": 0.9221307635307312, "step": 1935 }, { "epoch": 1.04, "learning_rate": 8.664693460490282e-08, "logits/chosen": -2.128113031387329, "logits/rejected": -2.129631280899048, "logps/chosen": -1.172943353652954, "logps/rejected": -2.0080809593200684, "loss": 0.5584, "rewards/accuracies": 1.0, "rewards/chosen": 1.0040180683135986, "rewards/margins": 0.29044634103775024, "rewards/rejected": 0.7135717272758484, "step": 1936 }, { "epoch": 1.04, "learning_rate": 8.663207493863764e-08, "logits/chosen": -2.1056976318359375, "logits/rejected": -2.2782015800476074, "logps/chosen": -2.1812288761138916, "logps/rejected": -2.059861898422241, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 1.0405879020690918, "rewards/margins": 0.00960695743560791, "rewards/rejected": 1.0309809446334839, "step": 1937 }, { "epoch": 1.05, "learning_rate": 8.661720828456165e-08, "logits/chosen": -2.1232352256774902, "logits/rejected": -2.2863590717315674, "logps/chosen": -3.630807638168335, "logps/rejected": -5.04011344909668, "loss": 0.7716, "rewards/accuracies": 0.0, "rewards/chosen": 1.023882269859314, "rewards/margins": -0.15124893188476562, "rewards/rejected": 1.1751312017440796, "step": 1938 }, { "epoch": 1.05, "learning_rate": 8.66023346455108e-08, "logits/chosen": -2.024580240249634, "logits/rejected": -2.033195734024048, "logps/chosen": -3.0841777324676514, "logps/rejected": -2.1395974159240723, "loss": 0.4979, "rewards/accuracies": 1.0, "rewards/chosen": 0.9450832605361938, "rewards/margins": 0.4380388855934143, "rewards/rejected": 0.5070443749427795, "step": 1939 }, { "epoch": 1.05, "learning_rate": 8.658745402432231e-08, "logits/chosen": -2.039001703262329, "logits/rejected": -2.2206509113311768, "logps/chosen": -0.4733923375606537, "logps/rejected": -0.5131653547286987, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.7942258715629578, "rewards/margins": 0.019196748733520508, "rewards/rejected": 0.7750291228294373, "step": 1940 }, { "epoch": 1.05, "learning_rate": 8.657256642383476e-08, "logits/chosen": -1.994081735610962, "logits/rejected": -1.9815346002578735, "logps/chosen": -11.680598258972168, "logps/rejected": -7.095104217529297, "loss": 0.2845, "rewards/accuracies": 1.0, "rewards/chosen": 1.9385753870010376, "rewards/margins": 1.1113715171813965, "rewards/rejected": 0.8272039294242859, "step": 1941 }, { "epoch": 1.05, "learning_rate": 8.655767184688809e-08, "logits/chosen": -2.0355641841888428, "logits/rejected": -2.0213675498962402, "logps/chosen": -15.287232398986816, "logps/rejected": -4.363402843475342, "loss": 0.609, "rewards/accuracies": 1.0, "rewards/chosen": 1.2534409761428833, "rewards/margins": 0.17605280876159668, "rewards/rejected": 1.0773881673812866, "step": 1942 }, { "epoch": 1.05, "learning_rate": 8.654277029632351e-08, "logits/chosen": -2.1132125854492188, "logits/rejected": -2.238976240158081, "logps/chosen": -0.25648778676986694, "logps/rejected": -0.2584116756916046, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8191022276878357, "rewards/margins": 0.022648215293884277, "rewards/rejected": 0.7964540123939514, "step": 1943 }, { "epoch": 1.05, "learning_rate": 8.652786177498359e-08, "logits/chosen": -2.00384521484375, "logits/rejected": -2.016458034515381, "logps/chosen": -3.3930084705352783, "logps/rejected": -1.457250952720642, "loss": 0.5844, "rewards/accuracies": 1.0, "rewards/chosen": 0.8838008046150208, "rewards/margins": 0.23087161779403687, "rewards/rejected": 0.6529291868209839, "step": 1944 }, { "epoch": 1.05, "learning_rate": 8.651294628571222e-08, "logits/chosen": -2.126709222793579, "logits/rejected": -2.0458931922912598, "logps/chosen": -26.35336685180664, "logps/rejected": -4.870044708251953, "loss": 0.4974, "rewards/accuracies": 1.0, "rewards/chosen": 1.2301517724990845, "rewards/margins": 0.43927615880966187, "rewards/rejected": 0.7908756136894226, "step": 1945 }, { "epoch": 1.05, "learning_rate": 8.649802383135465e-08, "logits/chosen": -2.054844617843628, "logits/rejected": -2.230316400527954, "logps/chosen": -0.4906218349933624, "logps/rejected": -0.5081292390823364, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.8598695993423462, "rewards/margins": 0.021800696849822998, "rewards/rejected": 0.8380689024925232, "step": 1946 }, { "epoch": 1.05, "learning_rate": 8.648309441475744e-08, "logits/chosen": -1.9528040885925293, "logits/rejected": -2.2277512550354004, "logps/chosen": -0.9425740242004395, "logps/rejected": -1.053191065788269, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 1.0292538404464722, "rewards/margins": -0.014382123947143555, "rewards/rejected": 1.0436359643936157, "step": 1947 }, { "epoch": 1.05, "learning_rate": 8.646815803876845e-08, "logits/chosen": -2.035508871078491, "logits/rejected": -2.280484676361084, "logps/chosen": -0.5109132528305054, "logps/rejected": -0.5735477209091187, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 0.9685226678848267, "rewards/margins": 0.03785747289657593, "rewards/rejected": 0.9306651949882507, "step": 1948 }, { "epoch": 1.05, "learning_rate": 8.645321470623692e-08, "logits/chosen": -1.9631394147872925, "logits/rejected": -2.2405173778533936, "logps/chosen": -0.494486540555954, "logps/rejected": -0.5089428424835205, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.7675718665122986, "rewards/margins": -0.0014011859893798828, "rewards/rejected": 0.7689730525016785, "step": 1949 }, { "epoch": 1.05, "learning_rate": 8.643826442001336e-08, "logits/chosen": -2.181094169616699, "logits/rejected": -2.294055700302124, "logps/chosen": -3.901202917098999, "logps/rejected": -1.6384626626968384, "loss": 0.7668, "rewards/accuracies": 0.0, "rewards/chosen": 0.7049630284309387, "rewards/margins": -0.142316997051239, "rewards/rejected": 0.8472800254821777, "step": 1950 }, { "epoch": 1.05, "learning_rate": 8.642330718294966e-08, "logits/chosen": -2.096205472946167, "logits/rejected": -2.1008646488189697, "logps/chosen": -3.7652587890625, "logps/rejected": -6.045621395111084, "loss": 0.5463, "rewards/accuracies": 1.0, "rewards/chosen": 0.9862634539604187, "rewards/margins": 0.3190177083015442, "rewards/rejected": 0.6672457456588745, "step": 1951 }, { "epoch": 1.05, "learning_rate": 8.6408342997899e-08, "logits/chosen": -2.1855366230010986, "logits/rejected": -2.2511794567108154, "logps/chosen": -3.7836837768554688, "logps/rejected": -12.200966835021973, "loss": 0.481, "rewards/accuracies": 1.0, "rewards/chosen": 1.303338885307312, "rewards/margins": 0.4817348122596741, "rewards/rejected": 0.8216040730476379, "step": 1952 }, { "epoch": 1.05, "learning_rate": 8.639337186771591e-08, "logits/chosen": -2.0886800289154053, "logits/rejected": -2.2759249210357666, "logps/chosen": -0.544526219367981, "logps/rejected": -0.5464910268783569, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.9154645800590515, "rewards/margins": 0.016716182231903076, "rewards/rejected": 0.8987483978271484, "step": 1953 }, { "epoch": 1.05, "learning_rate": 8.637839379525622e-08, "logits/chosen": -2.109724760055542, "logits/rejected": -2.04597806930542, "logps/chosen": -29.419384002685547, "logps/rejected": -2.0085346698760986, "loss": 0.2849, "rewards/accuracies": 1.0, "rewards/chosen": 1.9225491285324097, "rewards/margins": 1.1098716259002686, "rewards/rejected": 0.8126775026321411, "step": 1954 }, { "epoch": 1.05, "learning_rate": 8.636340878337711e-08, "logits/chosen": -2.1153857707977295, "logits/rejected": -2.1108057498931885, "logps/chosen": -4.779621601104736, "logps/rejected": -4.49345588684082, "loss": 0.4336, "rewards/accuracies": 1.0, "rewards/chosen": 1.0831629037857056, "rewards/margins": 0.6111512184143066, "rewards/rejected": 0.47201165556907654, "step": 1955 }, { "epoch": 1.06, "learning_rate": 8.634841683493704e-08, "logits/chosen": -2.0350770950317383, "logits/rejected": -2.0438907146453857, "logps/chosen": -4.255612850189209, "logps/rejected": -9.338287353515625, "loss": 0.4369, "rewards/accuracies": 1.0, "rewards/chosen": 1.144926905632019, "rewards/margins": 0.6017972826957703, "rewards/rejected": 0.5431296229362488, "step": 1956 }, { "epoch": 1.06, "learning_rate": 8.633341795279585e-08, "logits/chosen": -2.0423364639282227, "logits/rejected": -2.2333664894104004, "logps/chosen": -7.486518383026123, "logps/rejected": -5.113576889038086, "loss": 0.7475, "rewards/accuracies": 0.0, "rewards/chosen": 0.9712200164794922, "rewards/margins": -0.1058429479598999, "rewards/rejected": 1.077062964439392, "step": 1957 }, { "epoch": 1.06, "learning_rate": 8.631841213981468e-08, "logits/chosen": -2.119541645050049, "logits/rejected": -2.2787058353424072, "logps/chosen": -0.6635995507240295, "logps/rejected": -0.7084815502166748, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.9305704236030579, "rewards/margins": 0.008977413177490234, "rewards/rejected": 0.9215930104255676, "step": 1958 }, { "epoch": 1.06, "learning_rate": 8.630339939885596e-08, "logits/chosen": -2.0971522331237793, "logits/rejected": -2.2955100536346436, "logps/chosen": -4.404913902282715, "logps/rejected": -3.917914867401123, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": 0.9120965003967285, "rewards/margins": -0.01487874984741211, "rewards/rejected": 0.9269752502441406, "step": 1959 }, { "epoch": 1.06, "learning_rate": 8.628837973278351e-08, "logits/chosen": -2.00789737701416, "logits/rejected": -2.2202329635620117, "logps/chosen": -0.34022676944732666, "logps/rejected": -0.36947810649871826, "loss": 0.6688, "rewards/accuracies": 1.0, "rewards/chosen": 1.0160499811172485, "rewards/margins": 0.049363791942596436, "rewards/rejected": 0.9666861891746521, "step": 1960 }, { "epoch": 1.06, "learning_rate": 8.627335314446239e-08, "logits/chosen": -2.1656084060668945, "logits/rejected": -2.15525221824646, "logps/chosen": -3.0510103702545166, "logps/rejected": -3.210151433944702, "loss": 0.4341, "rewards/accuracies": 1.0, "rewards/chosen": 1.1612604856491089, "rewards/margins": 0.6094966530799866, "rewards/rejected": 0.5517638325691223, "step": 1961 }, { "epoch": 1.06, "learning_rate": 8.625831963675905e-08, "logits/chosen": -2.0739474296569824, "logits/rejected": -2.0752620697021484, "logps/chosen": -2.694490909576416, "logps/rejected": -3.957369565963745, "loss": 0.3145, "rewards/accuracies": 1.0, "rewards/chosen": 1.522166132926941, "rewards/margins": 0.9952322840690613, "rewards/rejected": 0.5269338488578796, "step": 1962 }, { "epoch": 1.06, "learning_rate": 8.624327921254122e-08, "logits/chosen": -2.15297794342041, "logits/rejected": -2.149333953857422, "logps/chosen": -7.362093448638916, "logps/rejected": -3.1127893924713135, "loss": 0.475, "rewards/accuracies": 1.0, "rewards/chosen": 1.202467918395996, "rewards/margins": 0.4974824786186218, "rewards/rejected": 0.7049854397773743, "step": 1963 }, { "epoch": 1.06, "learning_rate": 8.622823187467796e-08, "logits/chosen": -2.0392298698425293, "logits/rejected": -2.043733835220337, "logps/chosen": -0.9066190719604492, "logps/rejected": -1.9482481479644775, "loss": 0.5174, "rewards/accuracies": 1.0, "rewards/chosen": 1.0451823472976685, "rewards/margins": 0.3890450596809387, "rewards/rejected": 0.6561372876167297, "step": 1964 }, { "epoch": 1.06, "learning_rate": 8.621317762603964e-08, "logits/chosen": -2.054746389389038, "logits/rejected": -2.0593080520629883, "logps/chosen": -0.6370375156402588, "logps/rejected": -12.032659530639648, "loss": 0.5187, "rewards/accuracies": 1.0, "rewards/chosen": 1.0274404287338257, "rewards/margins": 0.3859771490097046, "rewards/rejected": 0.6414632797241211, "step": 1965 }, { "epoch": 1.06, "learning_rate": 8.6198116469498e-08, "logits/chosen": -2.051285982131958, "logits/rejected": -2.244083881378174, "logps/chosen": -0.7166513204574585, "logps/rejected": -0.7389671802520752, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.9190487265586853, "rewards/margins": -0.006922304630279541, "rewards/rejected": 0.9259710311889648, "step": 1966 }, { "epoch": 1.06, "learning_rate": 8.618304840792599e-08, "logits/chosen": -2.0099294185638428, "logits/rejected": -2.2783472537994385, "logps/chosen": -0.7289342880249023, "logps/rejected": -0.8485254049301147, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.9985448122024536, "rewards/margins": 0.0018270611763000488, "rewards/rejected": 0.9967177510261536, "step": 1967 }, { "epoch": 1.06, "learning_rate": 8.616797344419798e-08, "logits/chosen": -2.0281639099121094, "logits/rejected": -2.2650914192199707, "logps/chosen": -2.202629566192627, "logps/rejected": -6.652615547180176, "loss": 0.5889, "rewards/accuracies": 1.0, "rewards/chosen": 1.1818668842315674, "rewards/margins": 0.22053444385528564, "rewards/rejected": 0.9613324403762817, "step": 1968 }, { "epoch": 1.06, "learning_rate": 8.615289158118962e-08, "logits/chosen": -2.091937303543091, "logits/rejected": -2.1412012577056885, "logps/chosen": -8.33085823059082, "logps/rejected": -8.736414909362793, "loss": 0.4394, "rewards/accuracies": 1.0, "rewards/chosen": 1.3646844625473022, "rewards/margins": 0.5946932435035706, "rewards/rejected": 0.7699912190437317, "step": 1969 }, { "epoch": 1.06, "learning_rate": 8.613780282177788e-08, "logits/chosen": -2.0914011001586914, "logits/rejected": -2.2803688049316406, "logps/chosen": -2.5280301570892334, "logps/rejected": -2.5091919898986816, "loss": 0.6711, "rewards/accuracies": 1.0, "rewards/chosen": 0.9658505320549011, "rewards/margins": 0.04461640119552612, "rewards/rejected": 0.921234130859375, "step": 1970 }, { "epoch": 1.06, "learning_rate": 8.612270716884103e-08, "logits/chosen": -2.137148380279541, "logits/rejected": -2.0512349605560303, "logps/chosen": -19.95565414428711, "logps/rejected": -1.954587459564209, "loss": 0.3578, "rewards/accuracies": 1.0, "rewards/chosen": 1.6473270654678345, "rewards/margins": 0.843623161315918, "rewards/rejected": 0.8037039041519165, "step": 1971 }, { "epoch": 1.06, "learning_rate": 8.610760462525865e-08, "logits/chosen": -2.08644437789917, "logits/rejected": -2.2810111045837402, "logps/chosen": -8.319440841674805, "logps/rejected": -8.060951232910156, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 1.214543342590332, "rewards/margins": 0.021854400634765625, "rewards/rejected": 1.1926889419555664, "step": 1972 }, { "epoch": 1.06, "learning_rate": 8.609249519391167e-08, "logits/chosen": -2.122955322265625, "logits/rejected": -2.062498092651367, "logps/chosen": -31.60392951965332, "logps/rejected": -7.6136603355407715, "loss": 0.2931, "rewards/accuracies": 1.0, "rewards/chosen": 2.0143778324127197, "rewards/margins": 1.0769307613372803, "rewards/rejected": 0.9374470114707947, "step": 1973 }, { "epoch": 1.06, "learning_rate": 8.607737887768231e-08, "logits/chosen": -1.992496371269226, "logits/rejected": -2.3062756061553955, "logps/chosen": -0.8073508739471436, "logps/rejected": -0.8243100643157959, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 0.8743838667869568, "rewards/margins": 0.023649156093597412, "rewards/rejected": 0.8507347106933594, "step": 1974 }, { "epoch": 1.07, "learning_rate": 8.606225567945411e-08, "logits/chosen": -2.0222675800323486, "logits/rejected": -2.06693434715271, "logps/chosen": -6.2896928787231445, "logps/rejected": -18.06024169921875, "loss": 0.5259, "rewards/accuracies": 1.0, "rewards/chosen": 1.0677460432052612, "rewards/margins": 0.36810439825057983, "rewards/rejected": 0.6996416449546814, "step": 1975 }, { "epoch": 1.07, "learning_rate": 8.604712560211193e-08, "logits/chosen": -2.1156468391418457, "logits/rejected": -2.1104488372802734, "logps/chosen": -6.544407844543457, "logps/rejected": -4.497718811035156, "loss": 0.4262, "rewards/accuracies": 1.0, "rewards/chosen": 1.132935881614685, "rewards/margins": 0.6321876049041748, "rewards/rejected": 0.5007482767105103, "step": 1976 }, { "epoch": 1.07, "learning_rate": 8.603198864854192e-08, "logits/chosen": -2.100837230682373, "logits/rejected": -2.103259801864624, "logps/chosen": -0.7334333062171936, "logps/rejected": -8.471807479858398, "loss": 0.4079, "rewards/accuracies": 1.0, "rewards/chosen": 1.1782972812652588, "rewards/margins": 0.6858881711959839, "rewards/rejected": 0.4924091398715973, "step": 1977 }, { "epoch": 1.07, "learning_rate": 8.601684482163155e-08, "logits/chosen": -2.1340296268463135, "logits/rejected": -2.2547943592071533, "logps/chosen": -0.6576254367828369, "logps/rejected": -0.671493649482727, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.7653929591178894, "rewards/margins": 0.007490396499633789, "rewards/rejected": 0.7579025626182556, "step": 1978 }, { "epoch": 1.07, "learning_rate": 8.600169412426961e-08, "logits/chosen": -1.9599205255508423, "logits/rejected": -1.9744561910629272, "logps/chosen": -5.081261157989502, "logps/rejected": -7.731146812438965, "loss": 0.5245, "rewards/accuracies": 1.0, "rewards/chosen": 1.0446783304214478, "rewards/margins": 0.37160730361938477, "rewards/rejected": 0.673071026802063, "step": 1979 }, { "epoch": 1.07, "learning_rate": 8.59865365593462e-08, "logits/chosen": -1.9817161560058594, "logits/rejected": -1.9889708757400513, "logps/chosen": -1.869154930114746, "logps/rejected": -2.660325288772583, "loss": 0.5241, "rewards/accuracies": 1.0, "rewards/chosen": 1.0171624422073364, "rewards/margins": 0.3725771903991699, "rewards/rejected": 0.6445852518081665, "step": 1980 }, { "epoch": 1.07, "learning_rate": 8.597137212975271e-08, "logits/chosen": -2.18930721282959, "logits/rejected": -2.1696040630340576, "logps/chosen": -18.069692611694336, "logps/rejected": -1.7165875434875488, "loss": 0.4433, "rewards/accuracies": 1.0, "rewards/chosen": 1.2529455423355103, "rewards/margins": 0.5838028788566589, "rewards/rejected": 0.6691426634788513, "step": 1981 }, { "epoch": 1.07, "learning_rate": 8.595620083838187e-08, "logits/chosen": -2.1094648838043213, "logits/rejected": -2.3632326126098633, "logps/chosen": -0.7796955704689026, "logps/rejected": -0.7629479765892029, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9879465103149414, "rewards/margins": 0.009397566318511963, "rewards/rejected": 0.9785489439964294, "step": 1982 }, { "epoch": 1.07, "learning_rate": 8.59410226881277e-08, "logits/chosen": -2.1202034950256348, "logits/rejected": -2.1325695514678955, "logps/chosen": -4.939535617828369, "logps/rejected": -3.219693422317505, "loss": 0.4309, "rewards/accuracies": 1.0, "rewards/chosen": 1.260634422302246, "rewards/margins": 0.6185771822929382, "rewards/rejected": 0.6420572400093079, "step": 1983 }, { "epoch": 1.07, "learning_rate": 8.592583768188555e-08, "logits/chosen": -2.0569374561309814, "logits/rejected": -2.0859386920928955, "logps/chosen": -2.4875426292419434, "logps/rejected": -8.105222702026367, "loss": 0.3299, "rewards/accuracies": 1.0, "rewards/chosen": 1.6324251890182495, "rewards/margins": 0.9394129514694214, "rewards/rejected": 0.6930122375488281, "step": 1984 }, { "epoch": 1.07, "learning_rate": 8.591064582255203e-08, "logits/chosen": -2.1753177642822266, "logits/rejected": -2.039865016937256, "logps/chosen": -35.512271881103516, "logps/rejected": -5.179384708404541, "loss": 0.2717, "rewards/accuracies": 1.0, "rewards/chosen": 1.6507244110107422, "rewards/margins": 1.1639423370361328, "rewards/rejected": 0.486782044172287, "step": 1985 }, { "epoch": 1.07, "learning_rate": 8.589544711302512e-08, "logits/chosen": -2.118565320968628, "logits/rejected": -2.330467939376831, "logps/chosen": -0.11237238347530365, "logps/rejected": -0.10498902201652527, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.76169353723526, "rewards/margins": 0.014314353466033936, "rewards/rejected": 0.7473791837692261, "step": 1986 }, { "epoch": 1.07, "learning_rate": 8.588024155620404e-08, "logits/chosen": -2.073570489883423, "logits/rejected": -2.0749127864837646, "logps/chosen": -2.722625970840454, "logps/rejected": -0.8151346445083618, "loss": 0.581, "rewards/accuracies": 1.0, "rewards/chosen": 1.2889107465744019, "rewards/margins": 0.23841774463653564, "rewards/rejected": 1.0504930019378662, "step": 1987 }, { "epoch": 1.07, "learning_rate": 8.586502915498937e-08, "logits/chosen": -2.1301119327545166, "logits/rejected": -2.293626546859741, "logps/chosen": -3.652226448059082, "logps/rejected": -0.5211227536201477, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.9593791961669922, "rewards/margins": -0.017689228057861328, "rewards/rejected": 0.9770684242248535, "step": 1988 }, { "epoch": 1.07, "learning_rate": 8.584980991228297e-08, "logits/chosen": -2.086566686630249, "logits/rejected": -2.246086835861206, "logps/chosen": -0.7040973901748657, "logps/rejected": -0.7969474196434021, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 1.019832968711853, "rewards/margins": 0.006667017936706543, "rewards/rejected": 1.0131659507751465, "step": 1989 }, { "epoch": 1.07, "learning_rate": 8.583458383098802e-08, "logits/chosen": -2.1407573223114014, "logits/rejected": -2.135089159011841, "logps/chosen": -4.440193176269531, "logps/rejected": -4.108245849609375, "loss": 0.6349, "rewards/accuracies": 1.0, "rewards/chosen": 0.7834028601646423, "rewards/margins": 0.12011921405792236, "rewards/rejected": 0.66328364610672, "step": 1990 }, { "epoch": 1.07, "learning_rate": 8.581935091400898e-08, "logits/chosen": -2.2246224880218506, "logits/rejected": -2.2243316173553467, "logps/chosen": -1.5968081951141357, "logps/rejected": -1.3450560569763184, "loss": 0.5431, "rewards/accuracies": 1.0, "rewards/chosen": 1.010587215423584, "rewards/margins": 0.3265407681465149, "rewards/rejected": 0.6840464472770691, "step": 1991 }, { "epoch": 1.07, "learning_rate": 8.580411116425166e-08, "logits/chosen": -1.983967900276184, "logits/rejected": -2.2778282165527344, "logps/chosen": -0.7782469987869263, "logps/rejected": -0.835870087146759, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.8919375538825989, "rewards/margins": -0.003357231616973877, "rewards/rejected": 0.8952947854995728, "step": 1992 }, { "epoch": 1.07, "learning_rate": 8.57888645846231e-08, "logits/chosen": -2.237370252609253, "logits/rejected": -2.248420476913452, "logps/chosen": -5.571242809295654, "logps/rejected": -5.29451847076416, "loss": 0.4444, "rewards/accuracies": 1.0, "rewards/chosen": 1.0766849517822266, "rewards/margins": 0.5806678533554077, "rewards/rejected": 0.49601706862449646, "step": 1993 }, { "epoch": 1.08, "learning_rate": 8.577361117803173e-08, "logits/chosen": -2.192444086074829, "logits/rejected": -2.066929817199707, "logps/chosen": -35.608116149902344, "logps/rejected": -4.181114673614502, "loss": 0.2928, "rewards/accuracies": 1.0, "rewards/chosen": 1.7164421081542969, "rewards/margins": 1.0782719850540161, "rewards/rejected": 0.6381701231002808, "step": 1994 }, { "epoch": 1.08, "learning_rate": 8.575835094738722e-08, "logits/chosen": -2.1255362033843994, "logits/rejected": -2.274427652359009, "logps/chosen": -0.5605075359344482, "logps/rejected": -0.6476998329162598, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9750562906265259, "rewards/margins": 0.005540072917938232, "rewards/rejected": 0.9695162177085876, "step": 1995 }, { "epoch": 1.08, "learning_rate": 8.574308389560054e-08, "logits/chosen": -2.0487427711486816, "logits/rejected": -2.052565097808838, "logps/chosen": -5.325922966003418, "logps/rejected": -3.615060806274414, "loss": 0.5699, "rewards/accuracies": 1.0, "rewards/chosen": 0.9173693656921387, "rewards/margins": 0.26387012004852295, "rewards/rejected": 0.6534992456436157, "step": 1996 }, { "epoch": 1.08, "learning_rate": 8.572781002558402e-08, "logits/chosen": -2.0456578731536865, "logits/rejected": -2.3245997428894043, "logps/chosen": -0.4120608866214752, "logps/rejected": -0.4084828495979309, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.986773669719696, "rewards/margins": 0.019038736820220947, "rewards/rejected": 0.9677349328994751, "step": 1997 }, { "epoch": 1.08, "learning_rate": 8.571252934025122e-08, "logits/chosen": -1.954866647720337, "logits/rejected": -2.268343687057495, "logps/chosen": -0.2679993510246277, "logps/rejected": -0.30142873525619507, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.9044243693351746, "rewards/margins": 0.02992349863052368, "rewards/rejected": 0.8745008707046509, "step": 1998 }, { "epoch": 1.08, "learning_rate": 8.569724184251703e-08, "logits/chosen": -2.3150506019592285, "logits/rejected": -2.198992967605591, "logps/chosen": -33.731040954589844, "logps/rejected": -2.724508762359619, "loss": 0.3108, "rewards/accuracies": 1.0, "rewards/chosen": 1.7799729108810425, "rewards/margins": 1.009070634841919, "rewards/rejected": 0.7709023356437683, "step": 1999 }, { "epoch": 1.08, "learning_rate": 8.568194753529767e-08, "logits/chosen": -1.9579102993011475, "logits/rejected": -1.958371877670288, "logps/chosen": -4.963815212249756, "logps/rejected": -0.37636682391166687, "loss": 0.5472, "rewards/accuracies": 1.0, "rewards/chosen": 1.2546011209487915, "rewards/margins": 0.3169165849685669, "rewards/rejected": 0.9376845359802246, "step": 2000 }, { "epoch": 1.08, "learning_rate": 8.56666464215106e-08, "logits/chosen": -2.031142473220825, "logits/rejected": -2.2655463218688965, "logps/chosen": -1.3054782152175903, "logps/rejected": -1.2260727882385254, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.7757079005241394, "rewards/margins": 0.013959348201751709, "rewards/rejected": 0.7617485523223877, "step": 2001 }, { "epoch": 1.08, "learning_rate": 8.565133850407464e-08, "logits/chosen": -2.101794958114624, "logits/rejected": -2.2493674755096436, "logps/chosen": -3.664522886276245, "logps/rejected": -4.543229579925537, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.7309147715568542, "rewards/margins": 0.006840527057647705, "rewards/rejected": 0.7240742444992065, "step": 2002 }, { "epoch": 1.08, "learning_rate": 8.563602378590986e-08, "logits/chosen": -2.0936381816864014, "logits/rejected": -2.0912351608276367, "logps/chosen": -3.0249059200286865, "logps/rejected": -4.7533345222473145, "loss": 0.2967, "rewards/accuracies": 1.0, "rewards/chosen": 1.5896347761154175, "rewards/margins": 1.0628414154052734, "rewards/rejected": 0.5267934203147888, "step": 2003 }, { "epoch": 1.08, "learning_rate": 8.562070226993762e-08, "logits/chosen": -2.1756715774536133, "logits/rejected": -2.1612401008605957, "logps/chosen": -2.843341112136841, "logps/rejected": -8.909053802490234, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": 1.2419878244400024, "rewards/margins": 0.7613463401794434, "rewards/rejected": 0.4806414544582367, "step": 2004 }, { "epoch": 1.08, "learning_rate": 8.560537395908066e-08, "logits/chosen": -1.9728541374206543, "logits/rejected": -2.308567762374878, "logps/chosen": -0.7746270298957825, "logps/rejected": -0.7468571662902832, "loss": 0.7036, "rewards/accuracies": 0.0, "rewards/chosen": 0.7718831896781921, "rewards/margins": -0.020862102508544922, "rewards/rejected": 0.7927452921867371, "step": 2005 }, { "epoch": 1.08, "learning_rate": 8.559003885626289e-08, "logits/chosen": -2.106088399887085, "logits/rejected": -2.0935986042022705, "logps/chosen": -15.858522415161133, "logps/rejected": -5.78934383392334, "loss": 0.329, "rewards/accuracies": 1.0, "rewards/chosen": 1.3359003067016602, "rewards/margins": 0.9425264596939087, "rewards/rejected": 0.39337387681007385, "step": 2006 }, { "epoch": 1.08, "learning_rate": 8.557469696440962e-08, "logits/chosen": -2.1409099102020264, "logits/rejected": -2.1460556983947754, "logps/chosen": -1.9787628650665283, "logps/rejected": -3.252017021179199, "loss": 0.5304, "rewards/accuracies": 1.0, "rewards/chosen": 0.9879412651062012, "rewards/margins": 0.35725098848342896, "rewards/rejected": 0.6306902766227722, "step": 2007 }, { "epoch": 1.08, "learning_rate": 8.555934828644742e-08, "logits/chosen": -2.2215676307678223, "logits/rejected": -2.218761444091797, "logps/chosen": -7.790717124938965, "logps/rejected": -4.870436668395996, "loss": 0.3009, "rewards/accuracies": 1.0, "rewards/chosen": 1.4330525398254395, "rewards/margins": 1.0469269752502441, "rewards/rejected": 0.3861255645751953, "step": 2008 }, { "epoch": 1.08, "learning_rate": 8.554399282530413e-08, "logits/chosen": -2.0548057556152344, "logits/rejected": -2.2334413528442383, "logps/chosen": -0.3522961139678955, "logps/rejected": -0.34749889373779297, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.8632476925849915, "rewards/margins": 0.020298361778259277, "rewards/rejected": 0.8429493308067322, "step": 2009 }, { "epoch": 1.08, "learning_rate": 8.552863058390893e-08, "logits/chosen": -1.978265643119812, "logits/rejected": -1.9777380228042603, "logps/chosen": -0.4237757623195648, "logps/rejected": -3.957751989364624, "loss": 0.4765, "rewards/accuracies": 1.0, "rewards/chosen": 1.0378214120864868, "rewards/margins": 0.4935709238052368, "rewards/rejected": 0.54425048828125, "step": 2010 }, { "epoch": 1.08, "learning_rate": 8.551326156519226e-08, "logits/chosen": -1.986496090888977, "logits/rejected": -1.9867109060287476, "logps/chosen": -2.3077661991119385, "logps/rejected": -1.1661107540130615, "loss": 0.718, "rewards/accuracies": 0.0, "rewards/chosen": 0.9818177223205566, "rewards/margins": -0.049091339111328125, "rewards/rejected": 1.0309090614318848, "step": 2011 }, { "epoch": 1.09, "learning_rate": 8.549788577208585e-08, "logits/chosen": -2.1394333839416504, "logits/rejected": -2.258615732192993, "logps/chosen": -1.472278356552124, "logps/rejected": -1.5485727787017822, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.6548185348510742, "rewards/margins": 0.011264681816101074, "rewards/rejected": 0.6435538530349731, "step": 2012 }, { "epoch": 1.09, "learning_rate": 8.548250320752275e-08, "logits/chosen": -2.07438588142395, "logits/rejected": -2.0745434761047363, "logps/chosen": -2.555185317993164, "logps/rejected": -1.5297174453735352, "loss": 0.6661, "rewards/accuracies": 1.0, "rewards/chosen": 0.996383786201477, "rewards/margins": 0.05484718084335327, "rewards/rejected": 0.9415366053581238, "step": 2013 }, { "epoch": 1.09, "learning_rate": 8.546711387443728e-08, "logits/chosen": -1.923697829246521, "logits/rejected": -1.9405676126480103, "logps/chosen": -3.407442331314087, "logps/rejected": -5.866057872772217, "loss": 0.5749, "rewards/accuracies": 1.0, "rewards/chosen": 1.0287272930145264, "rewards/margins": 0.25229978561401367, "rewards/rejected": 0.7764275074005127, "step": 2014 }, { "epoch": 1.09, "learning_rate": 8.545171777576505e-08, "logits/chosen": -1.9646642208099365, "logits/rejected": -1.965025782585144, "logps/chosen": -3.663350820541382, "logps/rejected": -1.7323757410049438, "loss": 0.4995, "rewards/accuracies": 1.0, "rewards/chosen": 1.2570585012435913, "rewards/margins": 0.43400686979293823, "rewards/rejected": 0.8230516314506531, "step": 2015 }, { "epoch": 1.09, "learning_rate": 8.543631491444299e-08, "logits/chosen": -2.019347667694092, "logits/rejected": -2.243525981903076, "logps/chosen": -0.5371336936950684, "logps/rejected": -0.49579042196273804, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 0.9339426159858704, "rewards/margins": -0.015359044075012207, "rewards/rejected": 0.9493016600608826, "step": 2016 }, { "epoch": 1.09, "learning_rate": 8.542090529340928e-08, "logits/chosen": -2.0104310512542725, "logits/rejected": -2.193265199661255, "logps/chosen": -6.131539344787598, "logps/rejected": -1.4734090566635132, "loss": 0.8216, "rewards/accuracies": 0.0, "rewards/chosen": 0.6416193842887878, "rewards/margins": -0.24233227968215942, "rewards/rejected": 0.8839516639709473, "step": 2017 }, { "epoch": 1.09, "learning_rate": 8.54054889156034e-08, "logits/chosen": -2.0227956771850586, "logits/rejected": -2.2781405448913574, "logps/chosen": -1.6262552738189697, "logps/rejected": -1.4464759826660156, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 1.0664533376693726, "rewards/margins": 0.003913164138793945, "rewards/rejected": 1.0625401735305786, "step": 2018 }, { "epoch": 1.09, "learning_rate": 8.539006578396615e-08, "logits/chosen": -2.098437786102295, "logits/rejected": -2.3133597373962402, "logps/chosen": -1.0904593467712402, "logps/rejected": -6.373824119567871, "loss": 0.5527, "rewards/accuracies": 1.0, "rewards/chosen": 0.9566835761070251, "rewards/margins": 0.30400192737579346, "rewards/rejected": 0.6526816487312317, "step": 2019 }, { "epoch": 1.09, "learning_rate": 8.537463590143956e-08, "logits/chosen": -2.137544870376587, "logits/rejected": -2.2685763835906982, "logps/chosen": -2.7297937870025635, "logps/rejected": -0.691024899482727, "loss": 0.7143, "rewards/accuracies": 0.0, "rewards/chosen": 0.8446208238601685, "rewards/margins": -0.04182630777359009, "rewards/rejected": 0.8864471316337585, "step": 2020 }, { "epoch": 1.09, "learning_rate": 8.535919927096702e-08, "logits/chosen": -2.005504846572876, "logits/rejected": -2.0122382640838623, "logps/chosen": -1.8183797597885132, "logps/rejected": -5.449829578399658, "loss": 0.3857, "rewards/accuracies": 1.0, "rewards/chosen": 1.0862383842468262, "rewards/margins": 0.7537472248077393, "rewards/rejected": 0.3324911594390869, "step": 2021 }, { "epoch": 1.09, "learning_rate": 8.534375589549314e-08, "logits/chosen": -2.0441017150878906, "logits/rejected": -2.046715497970581, "logps/chosen": -5.34261417388916, "logps/rejected": -0.6181104183197021, "loss": 0.5638, "rewards/accuracies": 1.0, "rewards/chosen": 1.212037205696106, "rewards/margins": 0.2779087424278259, "rewards/rejected": 0.93412846326828, "step": 2022 }, { "epoch": 1.09, "learning_rate": 8.532830577796387e-08, "logits/chosen": -2.029761791229248, "logits/rejected": -2.0387210845947266, "logps/chosen": -2.4303622245788574, "logps/rejected": -1.8925851583480835, "loss": 0.3422, "rewards/accuracies": 1.0, "rewards/chosen": 1.5904426574707031, "rewards/margins": 0.8964785933494568, "rewards/rejected": 0.6939640641212463, "step": 2023 }, { "epoch": 1.09, "learning_rate": 8.531284892132641e-08, "logits/chosen": -2.145646572113037, "logits/rejected": -2.1498684883117676, "logps/chosen": -0.5804446339607239, "logps/rejected": -6.559309482574463, "loss": 0.4802, "rewards/accuracies": 1.0, "rewards/chosen": 0.9787607192993164, "rewards/margins": 0.48399224877357483, "rewards/rejected": 0.4947684705257416, "step": 2024 }, { "epoch": 1.09, "learning_rate": 8.529738532852925e-08, "logits/chosen": -2.0917046070098877, "logits/rejected": -2.085259437561035, "logps/chosen": -4.53114128112793, "logps/rejected": -5.0346269607543945, "loss": 0.4089, "rewards/accuracies": 1.0, "rewards/chosen": 1.1600370407104492, "rewards/margins": 0.6827296018600464, "rewards/rejected": 0.47730740904808044, "step": 2025 }, { "epoch": 1.09, "learning_rate": 8.52819150025222e-08, "logits/chosen": -2.04642915725708, "logits/rejected": -2.044232130050659, "logps/chosen": -5.080023765563965, "logps/rejected": -6.798667907714844, "loss": 0.5149, "rewards/accuracies": 1.0, "rewards/chosen": 1.3105906248092651, "rewards/margins": 0.3953341841697693, "rewards/rejected": 0.9152564406394958, "step": 2026 }, { "epoch": 1.09, "learning_rate": 8.526643794625628e-08, "logits/chosen": -2.03113055229187, "logits/rejected": -2.1091468334198, "logps/chosen": -2.829771041870117, "logps/rejected": -11.323832511901855, "loss": 0.3345, "rewards/accuracies": 1.0, "rewards/chosen": 1.4501618146896362, "rewards/margins": 0.9231114983558655, "rewards/rejected": 0.5270503163337708, "step": 2027 }, { "epoch": 1.09, "learning_rate": 8.525095416268388e-08, "logits/chosen": -2.0253336429595947, "logits/rejected": -2.2543752193450928, "logps/chosen": -7.125328063964844, "logps/rejected": -4.364137649536133, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.8114307522773743, "rewards/margins": 0.0017150044441223145, "rewards/rejected": 0.809715747833252, "step": 2028 }, { "epoch": 1.09, "learning_rate": 8.523546365475863e-08, "logits/chosen": -2.137662649154663, "logits/rejected": -2.287243366241455, "logps/chosen": -1.2801040410995483, "logps/rejected": -4.44661283493042, "loss": 0.6655, "rewards/accuracies": 1.0, "rewards/chosen": 0.9376904368400574, "rewards/margins": 0.05604809522628784, "rewards/rejected": 0.8816423416137695, "step": 2029 }, { "epoch": 1.09, "learning_rate": 8.521996642543543e-08, "logits/chosen": -2.112027168273926, "logits/rejected": -2.1059670448303223, "logps/chosen": -1.0586042404174805, "logps/rejected": -11.702726364135742, "loss": 0.4476, "rewards/accuracies": 1.0, "rewards/chosen": 1.1918214559555054, "rewards/margins": 0.5716695189476013, "rewards/rejected": 0.620151937007904, "step": 2030 }, { "epoch": 1.1, "learning_rate": 8.52044624776705e-08, "logits/chosen": -1.9584393501281738, "logits/rejected": -1.9766780138015747, "logps/chosen": -2.17856764793396, "logps/rejected": -13.525811195373535, "loss": 0.7725, "rewards/accuracies": 0.0, "rewards/chosen": 0.7887327075004578, "rewards/margins": -0.15281689167022705, "rewards/rejected": 0.9415495991706848, "step": 2031 }, { "epoch": 1.1, "learning_rate": 8.518895181442132e-08, "logits/chosen": -1.975304126739502, "logits/rejected": -1.9821853637695312, "logps/chosen": -1.8816910982131958, "logps/rejected": -3.181967258453369, "loss": 0.4669, "rewards/accuracies": 1.0, "rewards/chosen": 1.0613243579864502, "rewards/margins": 0.5192505717277527, "rewards/rejected": 0.5420737862586975, "step": 2032 }, { "epoch": 1.1, "learning_rate": 8.517343443864662e-08, "logits/chosen": -2.0867230892181396, "logits/rejected": -2.2702414989471436, "logps/chosen": -1.3970520496368408, "logps/rejected": -1.2987242937088013, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.9995101094245911, "rewards/margins": 0.03653746843338013, "rewards/rejected": 0.9629726409912109, "step": 2033 }, { "epoch": 1.1, "learning_rate": 8.515791035330646e-08, "logits/chosen": -2.136068105697632, "logits/rejected": -2.3164331912994385, "logps/chosen": -0.3827362358570099, "logps/rejected": -0.37303614616394043, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.8482772707939148, "rewards/margins": 0.0072786808013916016, "rewards/rejected": 0.8409985899925232, "step": 2034 }, { "epoch": 1.1, "learning_rate": 8.514237956136219e-08, "logits/chosen": -1.9889144897460938, "logits/rejected": -1.9893189668655396, "logps/chosen": -3.4153661727905273, "logps/rejected": -2.959432601928711, "loss": 0.3757, "rewards/accuracies": 1.0, "rewards/chosen": 1.4747220277786255, "rewards/margins": 0.7850847244262695, "rewards/rejected": 0.689637303352356, "step": 2035 }, { "epoch": 1.1, "learning_rate": 8.512684206577637e-08, "logits/chosen": -2.1737868785858154, "logits/rejected": -2.1629714965820312, "logps/chosen": -3.718402862548828, "logps/rejected": -9.691679954528809, "loss": 0.6185, "rewards/accuracies": 1.0, "rewards/chosen": 0.8253950476646423, "rewards/margins": 0.1553635597229004, "rewards/rejected": 0.6700314879417419, "step": 2036 }, { "epoch": 1.1, "learning_rate": 8.51112978695129e-08, "logits/chosen": -2.0557422637939453, "logits/rejected": -2.063886880874634, "logps/chosen": -2.693856716156006, "logps/rejected": -4.514512062072754, "loss": 0.5, "rewards/accuracies": 1.0, "rewards/chosen": 0.9461812973022461, "rewards/margins": 0.43265074491500854, "rewards/rejected": 0.5135305523872375, "step": 2037 }, { "epoch": 1.1, "learning_rate": 8.509574697553694e-08, "logits/chosen": -2.1487514972686768, "logits/rejected": -2.150017261505127, "logps/chosen": -5.867814540863037, "logps/rejected": -10.660845756530762, "loss": 0.2237, "rewards/accuracies": 1.0, "rewards/chosen": 1.7625274658203125, "rewards/margins": 1.3835937976837158, "rewards/rejected": 0.3789336383342743, "step": 2038 }, { "epoch": 1.1, "learning_rate": 8.508018938681492e-08, "logits/chosen": -2.093869924545288, "logits/rejected": -2.101417303085327, "logps/chosen": -1.5929707288742065, "logps/rejected": -3.839386463165283, "loss": 0.4423, "rewards/accuracies": 1.0, "rewards/chosen": 1.0695174932479858, "rewards/margins": 0.5863583087921143, "rewards/rejected": 0.48315921425819397, "step": 2039 }, { "epoch": 1.1, "learning_rate": 8.506462510631455e-08, "logits/chosen": -2.073840379714966, "logits/rejected": -2.2324306964874268, "logps/chosen": -0.5984092950820923, "logps/rejected": -0.6110574007034302, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.9599607586860657, "rewards/margins": 0.0023347139358520508, "rewards/rejected": 0.9576260447502136, "step": 2040 }, { "epoch": 1.1, "learning_rate": 8.504905413700483e-08, "logits/chosen": -2.1269466876983643, "logits/rejected": -2.2546634674072266, "logps/chosen": -3.8409903049468994, "logps/rejected": -3.86857533454895, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.8334373831748962, "rewards/margins": -0.007486999034881592, "rewards/rejected": 0.8409243822097778, "step": 2041 }, { "epoch": 1.1, "learning_rate": 8.503347648185601e-08, "logits/chosen": -2.1700849533081055, "logits/rejected": -2.1662607192993164, "logps/chosen": -6.6811842918396, "logps/rejected": -4.8855509757995605, "loss": 0.3696, "rewards/accuracies": 1.0, "rewards/chosen": 1.2685655355453491, "rewards/margins": 0.8048893213272095, "rewards/rejected": 0.46367621421813965, "step": 2042 }, { "epoch": 1.1, "learning_rate": 8.501789214383965e-08, "logits/chosen": -2.000765800476074, "logits/rejected": -2.253352403640747, "logps/chosen": -5.4182538986206055, "logps/rejected": -3.372408866882324, "loss": 0.7412, "rewards/accuracies": 0.0, "rewards/chosen": 0.5090016722679138, "rewards/margins": -0.09387922286987305, "rewards/rejected": 0.6028808951377869, "step": 2043 }, { "epoch": 1.1, "learning_rate": 8.500230112592855e-08, "logits/chosen": -2.1430234909057617, "logits/rejected": -2.0081770420074463, "logps/chosen": -33.2080192565918, "logps/rejected": -5.2276177406311035, "loss": 0.2785, "rewards/accuracies": 1.0, "rewards/chosen": 1.5377769470214844, "rewards/margins": 1.1360441446304321, "rewards/rejected": 0.40173277258872986, "step": 2044 }, { "epoch": 1.1, "learning_rate": 8.498670343109682e-08, "logits/chosen": -1.9230880737304688, "logits/rejected": -1.9229720830917358, "logps/chosen": -1.2295300960540771, "logps/rejected": -1.0938459634780884, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.8173586130142212, "rewards/margins": 0.031289756298065186, "rewards/rejected": 0.786068856716156, "step": 2045 }, { "epoch": 1.1, "learning_rate": 8.497109906231979e-08, "logits/chosen": -2.1671385765075684, "logits/rejected": -2.2880334854125977, "logps/chosen": -1.3651602268218994, "logps/rejected": -7.1535444259643555, "loss": 0.5681, "rewards/accuracies": 1.0, "rewards/chosen": 1.0535242557525635, "rewards/margins": 0.2679522633552551, "rewards/rejected": 0.7855719923973083, "step": 2046 }, { "epoch": 1.1, "learning_rate": 8.495548802257413e-08, "logits/chosen": -2.07353138923645, "logits/rejected": -2.285804510116577, "logps/chosen": -6.930627822875977, "logps/rejected": -5.994990348815918, "loss": 0.7108, "rewards/accuracies": 0.0, "rewards/chosen": 0.7954103350639343, "rewards/margins": -0.03494161367416382, "rewards/rejected": 0.8303519487380981, "step": 2047 }, { "epoch": 1.1, "learning_rate": 8.493987031483774e-08, "logits/chosen": -2.0246317386627197, "logits/rejected": -2.031230926513672, "logps/chosen": -1.8785068988800049, "logps/rejected": -3.995959997177124, "loss": 0.4671, "rewards/accuracies": 1.0, "rewards/chosen": 1.0889486074447632, "rewards/margins": 0.5185617804527283, "rewards/rejected": 0.5703868269920349, "step": 2048 }, { "epoch": 1.11, "learning_rate": 8.492424594208979e-08, "logits/chosen": -2.135277509689331, "logits/rejected": -2.1468088626861572, "logps/chosen": -3.2388811111450195, "logps/rejected": -3.3046913146972656, "loss": 0.5778, "rewards/accuracies": 1.0, "rewards/chosen": 1.298859715461731, "rewards/margins": 0.2456897497177124, "rewards/rejected": 1.0531699657440186, "step": 2049 }, { "epoch": 1.11, "learning_rate": 8.490861490731074e-08, "logits/chosen": -2.0811595916748047, "logits/rejected": -2.084487199783325, "logps/chosen": -1.839591145515442, "logps/rejected": -2.7275519371032715, "loss": 0.4545, "rewards/accuracies": 1.0, "rewards/chosen": 1.3631141185760498, "rewards/margins": 0.5527108311653137, "rewards/rejected": 0.8104032874107361, "step": 2050 }, { "epoch": 1.11, "learning_rate": 8.48929772134823e-08, "logits/chosen": -2.1829278469085693, "logits/rejected": -2.2563395500183105, "logps/chosen": -4.282177448272705, "logps/rejected": -26.749908447265625, "loss": 0.3886, "rewards/accuracies": 1.0, "rewards/chosen": 1.0946117639541626, "rewards/margins": 0.7446788549423218, "rewards/rejected": 0.34993287920951843, "step": 2051 }, { "epoch": 1.11, "learning_rate": 8.487733286358749e-08, "logits/chosen": -2.0552501678466797, "logits/rejected": -2.1006250381469727, "logps/chosen": -4.339585781097412, "logps/rejected": -25.532894134521484, "loss": 0.188, "rewards/accuracies": 1.0, "rewards/chosen": 1.2651197910308838, "rewards/margins": 1.5759247541427612, "rewards/rejected": -0.31080493330955505, "step": 2052 }, { "epoch": 1.11, "learning_rate": 8.486168186061056e-08, "logits/chosen": -2.1104624271392822, "logits/rejected": -2.2590765953063965, "logps/chosen": -2.5265274047851562, "logps/rejected": -1.4553430080413818, "loss": 0.6583, "rewards/accuracies": 1.0, "rewards/chosen": 1.011544108390808, "rewards/margins": 0.07089114189147949, "rewards/rejected": 0.9406529664993286, "step": 2053 }, { "epoch": 1.11, "learning_rate": 8.484602420753703e-08, "logits/chosen": -2.038498640060425, "logits/rejected": -2.042698383331299, "logps/chosen": -1.387068271636963, "logps/rejected": -3.5790607929229736, "loss": 0.4595, "rewards/accuracies": 1.0, "rewards/chosen": 1.1070492267608643, "rewards/margins": 0.5391797423362732, "rewards/rejected": 0.5678694844245911, "step": 2054 }, { "epoch": 1.11, "learning_rate": 8.483035990735371e-08, "logits/chosen": -1.994673490524292, "logits/rejected": -2.031080484390259, "logps/chosen": -8.632500648498535, "logps/rejected": -18.11070442199707, "loss": 0.3932, "rewards/accuracies": 1.0, "rewards/chosen": 1.16353178024292, "rewards/margins": 0.7302544116973877, "rewards/rejected": 0.43327733874320984, "step": 2055 }, { "epoch": 1.11, "learning_rate": 8.481468896304868e-08, "logits/chosen": -2.0739264488220215, "logits/rejected": -2.0856761932373047, "logps/chosen": -9.940677642822266, "logps/rejected": -6.566580772399902, "loss": 0.265, "rewards/accuracies": 1.0, "rewards/chosen": 1.9005831480026245, "rewards/margins": 1.1926417350769043, "rewards/rejected": 0.7079413533210754, "step": 2056 }, { "epoch": 1.11, "learning_rate": 8.479901137761128e-08, "logits/chosen": -2.100933074951172, "logits/rejected": -2.3309693336486816, "logps/chosen": -12.961618423461914, "logps/rejected": -8.258136749267578, "loss": 0.8293, "rewards/accuracies": 0.0, "rewards/chosen": 0.8875837326049805, "rewards/margins": -0.2560030221939087, "rewards/rejected": 1.1435867547988892, "step": 2057 }, { "epoch": 1.11, "learning_rate": 8.478332715403209e-08, "logits/chosen": -1.9782496690750122, "logits/rejected": -1.976302146911621, "logps/chosen": -2.718168258666992, "logps/rejected": -7.42377233505249, "loss": 0.4895, "rewards/accuracies": 1.0, "rewards/chosen": 1.018919587135315, "rewards/margins": 0.45972877740859985, "rewards/rejected": 0.5591908097267151, "step": 2058 }, { "epoch": 1.11, "learning_rate": 8.476763629530299e-08, "logits/chosen": -2.1383957862854004, "logits/rejected": -2.026195764541626, "logps/chosen": -10.03921890258789, "logps/rejected": -2.500387668609619, "loss": 0.3354, "rewards/accuracies": 1.0, "rewards/chosen": 1.6839083433151245, "rewards/margins": 0.92005854845047, "rewards/rejected": 0.7638497948646545, "step": 2059 }, { "epoch": 1.11, "learning_rate": 8.475193880441714e-08, "logits/chosen": -2.0886106491088867, "logits/rejected": -2.098980188369751, "logps/chosen": -3.047140598297119, "logps/rejected": -6.209505081176758, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9891918301582336, "rewards/margins": 0.005506336688995361, "rewards/rejected": 0.9836854934692383, "step": 2060 }, { "epoch": 1.11, "learning_rate": 8.473623468436887e-08, "logits/chosen": -2.094276189804077, "logits/rejected": -2.1028127670288086, "logps/chosen": -1.7288216352462769, "logps/rejected": -3.345005750656128, "loss": 0.4436, "rewards/accuracies": 1.0, "rewards/chosen": 1.210675597190857, "rewards/margins": 0.5828418135643005, "rewards/rejected": 0.6278337836265564, "step": 2061 }, { "epoch": 1.11, "learning_rate": 8.472052393815392e-08, "logits/chosen": -2.1372506618499756, "logits/rejected": -2.1332156658172607, "logps/chosen": -2.7869315147399902, "logps/rejected": -3.0679125785827637, "loss": 0.3944, "rewards/accuracies": 1.0, "rewards/chosen": 1.5282560586929321, "rewards/margins": 0.7266742587089539, "rewards/rejected": 0.8015817999839783, "step": 2062 }, { "epoch": 1.11, "learning_rate": 8.470480656876919e-08, "logits/chosen": -2.0429234504699707, "logits/rejected": -2.2406673431396484, "logps/chosen": -1.6571375131607056, "logps/rejected": -1.457618236541748, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 0.729537844657898, "rewards/margins": -0.007003366947174072, "rewards/rejected": 0.736541211605072, "step": 2063 }, { "epoch": 1.11, "learning_rate": 8.468908257921287e-08, "logits/chosen": -2.126707077026367, "logits/rejected": -2.1207683086395264, "logps/chosen": -3.6968486309051514, "logps/rejected": -3.5547361373901367, "loss": 0.417, "rewards/accuracies": 1.0, "rewards/chosen": 1.1185609102249146, "rewards/margins": 0.6588708162307739, "rewards/rejected": 0.4596900939941406, "step": 2064 }, { "epoch": 1.11, "learning_rate": 8.467335197248441e-08, "logits/chosen": -2.0519044399261475, "logits/rejected": -2.2788188457489014, "logps/chosen": -0.9254664778709412, "logps/rejected": -0.9246892929077148, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.7555902600288391, "rewards/margins": 0.01271963119506836, "rewards/rejected": 0.7428706288337708, "step": 2065 }, { "epoch": 1.11, "learning_rate": 8.465761475158454e-08, "logits/chosen": -2.063878297805786, "logits/rejected": -2.05552339553833, "logps/chosen": -5.530171871185303, "logps/rejected": -1.051784634590149, "loss": 0.4808, "rewards/accuracies": 1.0, "rewards/chosen": 1.4702695608139038, "rewards/margins": 0.4822348952293396, "rewards/rejected": 0.9880346655845642, "step": 2066 }, { "epoch": 1.11, "learning_rate": 8.464187091951523e-08, "logits/chosen": -2.1273772716522217, "logits/rejected": -2.1336183547973633, "logps/chosen": -1.9815161228179932, "logps/rejected": -4.239256858825684, "loss": 0.4154, "rewards/accuracies": 1.0, "rewards/chosen": 1.1739434003829956, "rewards/margins": 0.6636585593223572, "rewards/rejected": 0.5102848410606384, "step": 2067 }, { "epoch": 1.12, "learning_rate": 8.462612047927973e-08, "logits/chosen": -2.0598971843719482, "logits/rejected": -2.278951644897461, "logps/chosen": -8.082539558410645, "logps/rejected": -1.5555903911590576, "loss": 0.9221, "rewards/accuracies": 0.0, "rewards/chosen": 0.6467060446739197, "rewards/margins": -0.41517752408981323, "rewards/rejected": 1.061883568763733, "step": 2068 }, { "epoch": 1.12, "learning_rate": 8.461036343388252e-08, "logits/chosen": -2.0523548126220703, "logits/rejected": -2.2697107791900635, "logps/chosen": -2.4567387104034424, "logps/rejected": -2.3749067783355713, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.5957652926445007, "rewards/margins": 0.008463084697723389, "rewards/rejected": 0.5873022079467773, "step": 2069 }, { "epoch": 1.12, "learning_rate": 8.459459978632939e-08, "logits/chosen": -2.0473780632019043, "logits/rejected": -2.0484941005706787, "logps/chosen": -1.0195692777633667, "logps/rejected": -4.586365699768066, "loss": 0.5057, "rewards/accuracies": 1.0, "rewards/chosen": 0.9148432016372681, "rewards/margins": 0.4184051752090454, "rewards/rejected": 0.49643802642822266, "step": 2070 }, { "epoch": 1.12, "learning_rate": 8.457882953962735e-08, "logits/chosen": -1.95890212059021, "logits/rejected": -1.9657188653945923, "logps/chosen": -1.775583267211914, "logps/rejected": -4.048245906829834, "loss": 0.3969, "rewards/accuracies": 1.0, "rewards/chosen": 1.1865713596343994, "rewards/margins": 0.7190043330192566, "rewards/rejected": 0.4675670266151428, "step": 2071 }, { "epoch": 1.12, "learning_rate": 8.456305269678466e-08, "logits/chosen": -2.1197421550750732, "logits/rejected": -2.1202361583709717, "logps/chosen": -2.6384010314941406, "logps/rejected": -3.569589853286743, "loss": 0.5217, "rewards/accuracies": 1.0, "rewards/chosen": 0.9621594548225403, "rewards/margins": 0.37846022844314575, "rewards/rejected": 0.5836992263793945, "step": 2072 }, { "epoch": 1.12, "learning_rate": 8.454726926081089e-08, "logits/chosen": -2.1143200397491455, "logits/rejected": -2.1838252544403076, "logps/chosen": -3.5755844116210938, "logps/rejected": -28.44684410095215, "loss": 0.4068, "rewards/accuracies": 1.0, "rewards/chosen": 1.1098570823669434, "rewards/margins": 0.6892586946487427, "rewards/rejected": 0.42059841752052307, "step": 2073 }, { "epoch": 1.12, "learning_rate": 8.45314792347168e-08, "logits/chosen": -2.052529811859131, "logits/rejected": -1.950466513633728, "logps/chosen": -34.813087463378906, "logps/rejected": -2.325972557067871, "loss": 0.2914, "rewards/accuracies": 1.0, "rewards/chosen": 1.8506832122802734, "rewards/margins": 1.0837461948394775, "rewards/rejected": 0.7669370770454407, "step": 2074 }, { "epoch": 1.12, "learning_rate": 8.451568262151446e-08, "logits/chosen": -1.9808788299560547, "logits/rejected": -2.293651580810547, "logps/chosen": -0.6890375018119812, "logps/rejected": -0.7087581753730774, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.8126888275146484, "rewards/margins": 0.005726337432861328, "rewards/rejected": 0.8069624900817871, "step": 2075 }, { "epoch": 1.12, "learning_rate": 8.449987942421718e-08, "logits/chosen": -2.045276641845703, "logits/rejected": -2.0456340312957764, "logps/chosen": -4.656004905700684, "logps/rejected": -3.6539580821990967, "loss": 0.2995, "rewards/accuracies": 1.0, "rewards/chosen": 1.5584622621536255, "rewards/margins": 1.0521328449249268, "rewards/rejected": 0.5063294768333435, "step": 2076 }, { "epoch": 1.12, "learning_rate": 8.448406964583952e-08, "logits/chosen": -2.038463592529297, "logits/rejected": -2.044238328933716, "logps/chosen": -1.2860037088394165, "logps/rejected": -3.855684280395508, "loss": 0.4816, "rewards/accuracies": 1.0, "rewards/chosen": 1.0428580045700073, "rewards/margins": 0.48013371229171753, "rewards/rejected": 0.5627242922782898, "step": 2077 }, { "epoch": 1.12, "learning_rate": 8.446825328939731e-08, "logits/chosen": -2.1419453620910645, "logits/rejected": -2.1424477100372314, "logps/chosen": -2.740732192993164, "logps/rejected": -5.778489112854004, "loss": 0.3748, "rewards/accuracies": 1.0, "rewards/chosen": 1.0677813291549683, "rewards/margins": 0.7880390882492065, "rewards/rejected": 0.2797422409057617, "step": 2078 }, { "epoch": 1.12, "learning_rate": 8.44524303579076e-08, "logits/chosen": -2.0848968029022217, "logits/rejected": -2.0875096321105957, "logps/chosen": -1.4843953847885132, "logps/rejected": -1.9958981275558472, "loss": 0.6086, "rewards/accuracies": 1.0, "rewards/chosen": 1.1306623220443726, "rewards/margins": 0.17692172527313232, "rewards/rejected": 0.9537405967712402, "step": 2079 }, { "epoch": 1.12, "learning_rate": 8.443660085438875e-08, "logits/chosen": -2.0019946098327637, "logits/rejected": -1.9973617792129517, "logps/chosen": -4.937733173370361, "logps/rejected": -2.1095893383026123, "loss": 0.3116, "rewards/accuracies": 1.0, "rewards/chosen": 1.5882854461669922, "rewards/margins": 1.0061321258544922, "rewards/rejected": 0.5821532607078552, "step": 2080 }, { "epoch": 1.12, "learning_rate": 8.442076478186032e-08, "logits/chosen": -2.1098060607910156, "logits/rejected": -2.1084041595458984, "logps/chosen": -4.43983793258667, "logps/rejected": -7.190855026245117, "loss": 0.246, "rewards/accuracies": 1.0, "rewards/chosen": 1.4982258081436157, "rewards/margins": 1.276965856552124, "rewards/rejected": 0.2212599813938141, "step": 2081 }, { "epoch": 1.12, "learning_rate": 8.440492214334316e-08, "logits/chosen": -2.0700273513793945, "logits/rejected": -2.286104440689087, "logps/chosen": -2.0714683532714844, "logps/rejected": -1.621416449546814, "loss": 0.7099, "rewards/accuracies": 0.0, "rewards/chosen": 0.4904550611972809, "rewards/margins": -0.033271580934524536, "rewards/rejected": 0.5237266421318054, "step": 2082 }, { "epoch": 1.12, "learning_rate": 8.438907294185934e-08, "logits/chosen": -2.1438708305358887, "logits/rejected": -2.339590072631836, "logps/chosen": -1.5064679384231567, "logps/rejected": -1.4802279472351074, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 1.008506417274475, "rewards/margins": 0.01567363739013672, "rewards/rejected": 0.9928327798843384, "step": 2083 }, { "epoch": 1.12, "learning_rate": 8.437321718043222e-08, "logits/chosen": -2.0304768085479736, "logits/rejected": -2.026885509490967, "logps/chosen": -3.476774215698242, "logps/rejected": -7.181359767913818, "loss": 0.4663, "rewards/accuracies": 1.0, "rewards/chosen": 1.151216983795166, "rewards/margins": 0.5208459496498108, "rewards/rejected": 0.6303710341453552, "step": 2084 }, { "epoch": 1.12, "learning_rate": 8.435735486208638e-08, "logits/chosen": -2.0300371646881104, "logits/rejected": -2.028930425643921, "logps/chosen": -0.6806944608688354, "logps/rejected": -3.9536848068237305, "loss": 0.5267, "rewards/accuracies": 1.0, "rewards/chosen": 0.901713490486145, "rewards/margins": 0.3663356900215149, "rewards/rejected": 0.5353778004646301, "step": 2085 }, { "epoch": 1.13, "learning_rate": 8.434148598984768e-08, "logits/chosen": -2.141324520111084, "logits/rejected": -2.284475564956665, "logps/chosen": -5.657828330993652, "logps/rejected": -0.821313738822937, "loss": 0.7785, "rewards/accuracies": 0.0, "rewards/chosen": 0.8515052795410156, "rewards/margins": -0.16396045684814453, "rewards/rejected": 1.0154657363891602, "step": 2086 }, { "epoch": 1.13, "learning_rate": 8.432561056674317e-08, "logits/chosen": -2.1437790393829346, "logits/rejected": -2.310353994369507, "logps/chosen": -0.6647488474845886, "logps/rejected": -0.7658568620681763, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 1.0980416536331177, "rewards/margins": 0.020144939422607422, "rewards/rejected": 1.0778967142105103, "step": 2087 }, { "epoch": 1.13, "learning_rate": 8.430972859580124e-08, "logits/chosen": -2.094217300415039, "logits/rejected": -2.284334182739258, "logps/chosen": -1.9230194091796875, "logps/rejected": -1.8331539630889893, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.5345562100410461, "rewards/margins": -0.00350797176361084, "rewards/rejected": 0.538064181804657, "step": 2088 }, { "epoch": 1.13, "learning_rate": 8.429384008005144e-08, "logits/chosen": -1.9782525300979614, "logits/rejected": -1.9745925664901733, "logps/chosen": -1.5930910110473633, "logps/rejected": -4.8830742835998535, "loss": 0.4826, "rewards/accuracies": 1.0, "rewards/chosen": 0.9862048029899597, "rewards/margins": 0.4776661992073059, "rewards/rejected": 0.5085386037826538, "step": 2089 }, { "epoch": 1.13, "learning_rate": 8.427794502252465e-08, "logits/chosen": -2.0652928352355957, "logits/rejected": -2.225944757461548, "logps/chosen": -1.144615888595581, "logps/rejected": -1.088528037071228, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 0.7814213633537292, "rewards/margins": 0.03749573230743408, "rewards/rejected": 0.7439256310462952, "step": 2090 }, { "epoch": 1.13, "learning_rate": 8.426204342625293e-08, "logits/chosen": -2.162076711654663, "logits/rejected": -1.9938825368881226, "logps/chosen": -38.9903678894043, "logps/rejected": -4.507257461547852, "loss": 0.2681, "rewards/accuracies": 1.0, "rewards/chosen": 1.901645302772522, "rewards/margins": 1.1794649362564087, "rewards/rejected": 0.7221803665161133, "step": 2091 }, { "epoch": 1.13, "learning_rate": 8.424613529426962e-08, "logits/chosen": -2.1421091556549072, "logits/rejected": -2.1233322620391846, "logps/chosen": -16.472145080566406, "logps/rejected": -3.7752952575683594, "loss": 0.3766, "rewards/accuracies": 1.0, "rewards/chosen": 1.2209198474884033, "rewards/margins": 0.7824730277061462, "rewards/rejected": 0.4384468197822571, "step": 2092 }, { "epoch": 1.13, "learning_rate": 8.423022062960933e-08, "logits/chosen": -2.139317035675049, "logits/rejected": -2.1300160884857178, "logps/chosen": -7.060724258422852, "logps/rejected": -3.393317461013794, "loss": 0.5972, "rewards/accuracies": 1.0, "rewards/chosen": 0.7631037831306458, "rewards/margins": 0.20209819078445435, "rewards/rejected": 0.5610055923461914, "step": 2093 }, { "epoch": 1.13, "learning_rate": 8.421429943530785e-08, "logits/chosen": -2.0839362144470215, "logits/rejected": -2.086580991744995, "logps/chosen": -0.8173553943634033, "logps/rejected": -3.4469382762908936, "loss": 0.4852, "rewards/accuracies": 1.0, "rewards/chosen": 1.065977931022644, "rewards/margins": 0.47092205286026, "rewards/rejected": 0.595055878162384, "step": 2094 }, { "epoch": 1.13, "learning_rate": 8.419837171440225e-08, "logits/chosen": -2.1543102264404297, "logits/rejected": -2.1172549724578857, "logps/chosen": -28.991378784179688, "logps/rejected": -12.879199028015137, "loss": 0.433, "rewards/accuracies": 1.0, "rewards/chosen": 1.5938224792480469, "rewards/margins": 0.6126770973205566, "rewards/rejected": 0.9811453819274902, "step": 2095 }, { "epoch": 1.13, "learning_rate": 8.418243746993085e-08, "logits/chosen": -2.1605682373046875, "logits/rejected": -2.1564900875091553, "logps/chosen": -2.795989513397217, "logps/rejected": -10.45451831817627, "loss": 0.3661, "rewards/accuracies": 1.0, "rewards/chosen": 1.3184404373168945, "rewards/margins": 0.8162009119987488, "rewards/rejected": 0.5022395253181458, "step": 2096 }, { "epoch": 1.13, "learning_rate": 8.416649670493326e-08, "logits/chosen": -2.015268087387085, "logits/rejected": -2.246784210205078, "logps/chosen": -0.9358950257301331, "logps/rejected": -1.0813137292861938, "loss": 0.6741, "rewards/accuracies": 1.0, "rewards/chosen": 0.7440455555915833, "rewards/margins": 0.03854280710220337, "rewards/rejected": 0.7055027484893799, "step": 2097 }, { "epoch": 1.13, "learning_rate": 8.415054942245023e-08, "logits/chosen": -2.186969041824341, "logits/rejected": -2.2843940258026123, "logps/chosen": -5.201478481292725, "logps/rejected": -1.5829709768295288, "loss": 0.7677, "rewards/accuracies": 0.0, "rewards/chosen": 1.0989660024642944, "rewards/margins": -0.14392578601837158, "rewards/rejected": 1.242891788482666, "step": 2098 }, { "epoch": 1.13, "learning_rate": 8.413459562552384e-08, "logits/chosen": -2.0150747299194336, "logits/rejected": -2.3072664737701416, "logps/chosen": -4.352144241333008, "logps/rejected": -5.871100902557373, "loss": 0.5964, "rewards/accuracies": 1.0, "rewards/chosen": 0.8671509623527527, "rewards/margins": 0.20377850532531738, "rewards/rejected": 0.6633724570274353, "step": 2099 }, { "epoch": 1.13, "learning_rate": 8.411863531719737e-08, "logits/chosen": -2.003939628601074, "logits/rejected": -2.00138783454895, "logps/chosen": -9.000426292419434, "logps/rejected": -1.674180507659912, "loss": 0.4335, "rewards/accuracies": 1.0, "rewards/chosen": 1.4135777950286865, "rewards/margins": 0.6113196611404419, "rewards/rejected": 0.8022581338882446, "step": 2100 }, { "epoch": 1.13, "learning_rate": 8.410266850051538e-08, "logits/chosen": -2.126385450363159, "logits/rejected": -2.1268458366394043, "logps/chosen": -2.6853578090667725, "logps/rejected": -2.359743356704712, "loss": 0.6299, "rewards/accuracies": 1.0, "rewards/chosen": 0.9412922263145447, "rewards/margins": 0.13087081909179688, "rewards/rejected": 0.8104214072227478, "step": 2101 }, { "epoch": 1.13, "learning_rate": 8.40866951785236e-08, "logits/chosen": -2.1528871059417725, "logits/rejected": -2.3010854721069336, "logps/chosen": -0.6584476828575134, "logps/rejected": -0.6419649720191956, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.9522919058799744, "rewards/margins": 0.018102943897247314, "rewards/rejected": 0.934188961982727, "step": 2102 }, { "epoch": 1.13, "learning_rate": 8.407071535426907e-08, "logits/chosen": -2.0130302906036377, "logits/rejected": -2.260936737060547, "logps/chosen": -4.974888801574707, "logps/rejected": -4.990553855895996, "loss": 0.7033, "rewards/accuracies": 0.0, "rewards/chosen": 0.9864701628684998, "rewards/margins": -0.020235836505889893, "rewards/rejected": 1.0067059993743896, "step": 2103 }, { "epoch": 1.13, "learning_rate": 8.405472903080007e-08, "logits/chosen": -2.1739463806152344, "logits/rejected": -2.2624661922454834, "logps/chosen": -6.893190383911133, "logps/rejected": -3.4311134815216064, "loss": 0.7374, "rewards/accuracies": 0.0, "rewards/chosen": 0.5272789001464844, "rewards/margins": -0.08657068014144897, "rewards/rejected": 0.6138495802879333, "step": 2104 }, { "epoch": 1.14, "learning_rate": 8.403873621116607e-08, "logits/chosen": -2.0722739696502686, "logits/rejected": -2.0748767852783203, "logps/chosen": -2.219163656234741, "logps/rejected": -4.076261043548584, "loss": 0.4355, "rewards/accuracies": 1.0, "rewards/chosen": 1.1177607774734497, "rewards/margins": 0.6054823398590088, "rewards/rejected": 0.5122784376144409, "step": 2105 }, { "epoch": 1.14, "learning_rate": 8.402273689841778e-08, "logits/chosen": -2.0381321907043457, "logits/rejected": -2.041693925857544, "logps/chosen": -0.3497919738292694, "logps/rejected": -4.665491580963135, "loss": 0.5528, "rewards/accuracies": 1.0, "rewards/chosen": 0.9251778721809387, "rewards/margins": 0.30359023809432983, "rewards/rejected": 0.6215876340866089, "step": 2106 }, { "epoch": 1.14, "learning_rate": 8.400673109560723e-08, "logits/chosen": -2.1662659645080566, "logits/rejected": -2.1611921787261963, "logps/chosen": -5.989581108093262, "logps/rejected": -4.146897792816162, "loss": 0.3888, "rewards/accuracies": 1.0, "rewards/chosen": 1.323041319847107, "rewards/margins": 0.7440353631973267, "rewards/rejected": 0.5790059566497803, "step": 2107 }, { "epoch": 1.14, "learning_rate": 8.399071880578761e-08, "logits/chosen": -2.052889823913574, "logits/rejected": -2.118025064468384, "logps/chosen": -7.518869876861572, "logps/rejected": -17.882450103759766, "loss": 0.3646, "rewards/accuracies": 1.0, "rewards/chosen": 1.3425493240356445, "rewards/margins": 0.8211713433265686, "rewards/rejected": 0.5213779807090759, "step": 2108 }, { "epoch": 1.14, "learning_rate": 8.397470003201338e-08, "logits/chosen": -2.0763704776763916, "logits/rejected": -2.2921078205108643, "logps/chosen": -1.0372660160064697, "logps/rejected": -6.494906425476074, "loss": 0.6184, "rewards/accuracies": 1.0, "rewards/chosen": 1.0373384952545166, "rewards/margins": 0.15554946660995483, "rewards/rejected": 0.8817890286445618, "step": 2109 }, { "epoch": 1.14, "learning_rate": 8.395867477734021e-08, "logits/chosen": -2.084472417831421, "logits/rejected": -2.2529871463775635, "logps/chosen": -0.5186542868614197, "logps/rejected": -0.5572209358215332, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.7994951009750366, "rewards/margins": 0.003194272518157959, "rewards/rejected": 0.7963008284568787, "step": 2110 }, { "epoch": 1.14, "learning_rate": 8.394264304482504e-08, "logits/chosen": -2.115187883377075, "logits/rejected": -2.0119571685791016, "logps/chosen": -26.61609649658203, "logps/rejected": -3.3340418338775635, "loss": 0.2209, "rewards/accuracies": 1.0, "rewards/chosen": 1.9746780395507812, "rewards/margins": 1.39774751663208, "rewards/rejected": 0.5769304633140564, "step": 2111 }, { "epoch": 1.14, "learning_rate": 8.3926604837526e-08, "logits/chosen": -2.1760988235473633, "logits/rejected": -2.1844241619110107, "logps/chosen": -3.1181352138519287, "logps/rejected": -2.9207582473754883, "loss": 0.5384, "rewards/accuracies": 1.0, "rewards/chosen": 0.8658119440078735, "rewards/margins": 0.33800458908081055, "rewards/rejected": 0.527807354927063, "step": 2112 }, { "epoch": 1.14, "learning_rate": 8.39105601585025e-08, "logits/chosen": -2.096762180328369, "logits/rejected": -2.2819557189941406, "logps/chosen": -3.0622239112854004, "logps/rejected": -2.9787561893463135, "loss": 0.704, "rewards/accuracies": 0.0, "rewards/chosen": 1.0164690017700195, "rewards/margins": -0.021597623825073242, "rewards/rejected": 1.0380666255950928, "step": 2113 }, { "epoch": 1.14, "learning_rate": 8.389450901081518e-08, "logits/chosen": -2.086637258529663, "logits/rejected": -2.0866219997406006, "logps/chosen": -4.367063999176025, "logps/rejected": -2.1136507987976074, "loss": 0.3574, "rewards/accuracies": 1.0, "rewards/chosen": 1.4484108686447144, "rewards/margins": 0.8447298407554626, "rewards/rejected": 0.6036810278892517, "step": 2114 }, { "epoch": 1.14, "learning_rate": 8.387845139752592e-08, "logits/chosen": -2.0064215660095215, "logits/rejected": -2.253082752227783, "logps/chosen": -0.4954138398170471, "logps/rejected": -0.5355231761932373, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": 0.8587142825126648, "rewards/margins": -0.012394249439239502, "rewards/rejected": 0.8711085319519043, "step": 2115 }, { "epoch": 1.14, "learning_rate": 8.386238732169777e-08, "logits/chosen": -1.9892082214355469, "logits/rejected": -2.2248451709747314, "logps/chosen": -0.6053475141525269, "logps/rejected": -0.7076655626296997, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.8295294046401978, "rewards/margins": 0.013839423656463623, "rewards/rejected": 0.8156899809837341, "step": 2116 }, { "epoch": 1.14, "learning_rate": 8.384631678639507e-08, "logits/chosen": -2.1079399585723877, "logits/rejected": -2.3369033336639404, "logps/chosen": -0.3717455267906189, "logps/rejected": -0.40471625328063965, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.8872157335281372, "rewards/margins": 0.027287781238555908, "rewards/rejected": 0.8599279522895813, "step": 2117 }, { "epoch": 1.14, "learning_rate": 8.38302397946834e-08, "logits/chosen": -2.136779308319092, "logits/rejected": -2.1276965141296387, "logps/chosen": -7.717352867126465, "logps/rejected": -1.7110648155212402, "loss": 0.4177, "rewards/accuracies": 1.0, "rewards/chosen": 1.392590880393982, "rewards/margins": 0.6569925546646118, "rewards/rejected": 0.7355983257293701, "step": 2118 }, { "epoch": 1.14, "learning_rate": 8.381415634962955e-08, "logits/chosen": -1.9999573230743408, "logits/rejected": -2.0050783157348633, "logps/chosen": -2.80552077293396, "logps/rejected": -2.1762495040893555, "loss": 0.522, "rewards/accuracies": 1.0, "rewards/chosen": 1.4054174423217773, "rewards/margins": 0.3778609037399292, "rewards/rejected": 1.0275565385818481, "step": 2119 }, { "epoch": 1.14, "learning_rate": 8.379806645430154e-08, "logits/chosen": -2.174795389175415, "logits/rejected": -2.3292579650878906, "logps/chosen": -17.614830017089844, "logps/rejected": -4.437863349914551, "loss": 0.7485, "rewards/accuracies": 0.0, "rewards/chosen": 0.7623199820518494, "rewards/margins": -0.1078791618347168, "rewards/rejected": 0.8701991438865662, "step": 2120 }, { "epoch": 1.14, "learning_rate": 8.378197011176862e-08, "logits/chosen": -2.2818520069122314, "logits/rejected": -2.1615984439849854, "logps/chosen": -34.12097930908203, "logps/rejected": -3.705413341522217, "loss": 0.2654, "rewards/accuracies": 1.0, "rewards/chosen": 2.0915932655334473, "rewards/margins": 1.1907494068145752, "rewards/rejected": 0.9008437991142273, "step": 2121 }, { "epoch": 1.14, "learning_rate": 8.376586732510127e-08, "logits/chosen": -2.123307704925537, "logits/rejected": -2.1212897300720215, "logps/chosen": -6.093432903289795, "logps/rejected": -2.540431261062622, "loss": 0.2816, "rewards/accuracies": 1.0, "rewards/chosen": 1.6846626996994019, "rewards/margins": 1.1230545043945312, "rewards/rejected": 0.5616082549095154, "step": 2122 }, { "epoch": 1.15, "learning_rate": 8.37497580973712e-08, "logits/chosen": -1.999481201171875, "logits/rejected": -2.0058908462524414, "logps/chosen": -1.6084905862808228, "logps/rejected": -2.9619505405426025, "loss": 0.4002, "rewards/accuracies": 1.0, "rewards/chosen": 1.3082653284072876, "rewards/margins": 0.7090734839439392, "rewards/rejected": 0.5991918444633484, "step": 2123 }, { "epoch": 1.15, "learning_rate": 8.373364243165138e-08, "logits/chosen": -2.174170970916748, "logits/rejected": -2.248804807662964, "logps/chosen": -0.7238622307777405, "logps/rejected": -0.7263316512107849, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 1.0473341941833496, "rewards/margins": 0.021059513092041016, "rewards/rejected": 1.0262746810913086, "step": 2124 }, { "epoch": 1.15, "learning_rate": 8.371752033101594e-08, "logits/chosen": -2.079542636871338, "logits/rejected": -2.2752368450164795, "logps/chosen": -2.1382603645324707, "logps/rejected": -2.522703170776367, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.899761974811554, "rewards/margins": 0.017631351947784424, "rewards/rejected": 0.8821306228637695, "step": 2125 }, { "epoch": 1.15, "learning_rate": 8.370139179854032e-08, "logits/chosen": -2.042410135269165, "logits/rejected": -2.3025293350219727, "logps/chosen": -0.38813352584838867, "logps/rejected": -0.3631811738014221, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.7089549899101257, "rewards/margins": 0.012715637683868408, "rewards/rejected": 0.6962393522262573, "step": 2126 }, { "epoch": 1.15, "learning_rate": 8.36852568373011e-08, "logits/chosen": -2.21138596534729, "logits/rejected": -2.065168619155884, "logps/chosen": -36.286712646484375, "logps/rejected": -10.496009826660156, "loss": 0.267, "rewards/accuracies": 1.0, "rewards/chosen": 1.7661151885986328, "rewards/margins": 1.1838653087615967, "rewards/rejected": 0.5822498202323914, "step": 2127 }, { "epoch": 1.15, "learning_rate": 8.366911545037617e-08, "logits/chosen": -1.9488872289657593, "logits/rejected": -1.9082845449447632, "logps/chosen": -13.865622520446777, "logps/rejected": -8.694066047668457, "loss": 0.3271, "rewards/accuracies": 1.0, "rewards/chosen": 1.3269946575164795, "rewards/margins": 0.9496269226074219, "rewards/rejected": 0.37736770510673523, "step": 2128 }, { "epoch": 1.15, "learning_rate": 8.365296764084456e-08, "logits/chosen": -2.0413830280303955, "logits/rejected": -2.2385547161102295, "logps/chosen": -0.3149532377719879, "logps/rejected": -0.31411612033843994, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.954345166683197, "rewards/margins": 0.022850751876831055, "rewards/rejected": 0.931494414806366, "step": 2129 }, { "epoch": 1.15, "learning_rate": 8.363681341178664e-08, "logits/chosen": -2.222846508026123, "logits/rejected": -2.279999017715454, "logps/chosen": -10.876131057739258, "logps/rejected": -8.26994514465332, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.8272585272789001, "rewards/margins": 0.027607381343841553, "rewards/rejected": 0.7996511459350586, "step": 2130 }, { "epoch": 1.15, "learning_rate": 8.362065276628387e-08, "logits/chosen": -2.0943832397460938, "logits/rejected": -2.2895612716674805, "logps/chosen": -3.24835467338562, "logps/rejected": -3.3917689323425293, "loss": 0.6543, "rewards/accuracies": 1.0, "rewards/chosen": 0.6130877733230591, "rewards/margins": 0.07920879125595093, "rewards/rejected": 0.5338789820671082, "step": 2131 }, { "epoch": 1.15, "learning_rate": 8.360448570741903e-08, "logits/chosen": -2.019490957260132, "logits/rejected": -2.0228641033172607, "logps/chosen": -0.36233365535736084, "logps/rejected": -4.789161205291748, "loss": 0.5451, "rewards/accuracies": 1.0, "rewards/chosen": 0.7481760382652283, "rewards/margins": 0.32182595133781433, "rewards/rejected": 0.42635008692741394, "step": 2132 }, { "epoch": 1.15, "learning_rate": 8.35883122382761e-08, "logits/chosen": -2.119293689727783, "logits/rejected": -2.1188390254974365, "logps/chosen": -0.30960842967033386, "logps/rejected": -4.044332027435303, "loss": 0.4815, "rewards/accuracies": 1.0, "rewards/chosen": 0.9957987666130066, "rewards/margins": 0.4805448651313782, "rewards/rejected": 0.5152539014816284, "step": 2133 }, { "epoch": 1.15, "learning_rate": 8.357213236194025e-08, "logits/chosen": -2.1063592433929443, "logits/rejected": -2.11248517036438, "logps/chosen": -12.727768898010254, "logps/rejected": -7.600829124450684, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 2.1278462409973145, "rewards/margins": 1.3776029348373413, "rewards/rejected": 0.7502433061599731, "step": 2134 }, { "epoch": 1.15, "learning_rate": 8.355594608149792e-08, "logits/chosen": -2.229551076889038, "logits/rejected": -2.0826590061187744, "logps/chosen": -56.139373779296875, "logps/rejected": -22.985118865966797, "loss": 0.217, "rewards/accuracies": 1.0, "rewards/chosen": 2.3304107189178467, "rewards/margins": 1.4171831607818604, "rewards/rejected": 0.9132274985313416, "step": 2135 }, { "epoch": 1.15, "learning_rate": 8.353975340003676e-08, "logits/chosen": -2.050584077835083, "logits/rejected": -2.2798287868499756, "logps/chosen": -0.80836421251297, "logps/rejected": -0.873136579990387, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 1.0372945070266724, "rewards/margins": -0.00766444206237793, "rewards/rejected": 1.0449589490890503, "step": 2136 }, { "epoch": 1.15, "learning_rate": 8.352355432064562e-08, "logits/chosen": -2.1335160732269287, "logits/rejected": -2.3219008445739746, "logps/chosen": -8.616476058959961, "logps/rejected": -7.159640312194824, "loss": 0.5147, "rewards/accuracies": 1.0, "rewards/chosen": 1.1424686908721924, "rewards/margins": 0.39580607414245605, "rewards/rejected": 0.7466626167297363, "step": 2137 }, { "epoch": 1.15, "learning_rate": 8.350734884641455e-08, "logits/chosen": -2.1366329193115234, "logits/rejected": -2.135127067565918, "logps/chosen": -7.141218662261963, "logps/rejected": -1.9466496706008911, "loss": 0.4868, "rewards/accuracies": 1.0, "rewards/chosen": 1.2396233081817627, "rewards/margins": 0.46658241748809814, "rewards/rejected": 0.7730408906936646, "step": 2138 }, { "epoch": 1.15, "learning_rate": 8.349113698043491e-08, "logits/chosen": -2.147494077682495, "logits/rejected": -2.1393892765045166, "logps/chosen": -1.327717900276184, "logps/rejected": -5.736552715301514, "loss": 0.4587, "rewards/accuracies": 1.0, "rewards/chosen": 1.0552865266799927, "rewards/margins": 0.5412253737449646, "rewards/rejected": 0.5140611529350281, "step": 2139 }, { "epoch": 1.15, "learning_rate": 8.347491872579916e-08, "logits/chosen": -2.1407413482666016, "logits/rejected": -2.140073537826538, "logps/chosen": -24.40393829345703, "logps/rejected": -8.47669792175293, "loss": 0.3683, "rewards/accuracies": 1.0, "rewards/chosen": 1.2521454095840454, "rewards/margins": 0.8090840578079224, "rewards/rejected": 0.44306135177612305, "step": 2140 }, { "epoch": 1.15, "learning_rate": 8.345869408560109e-08, "logits/chosen": -2.0105793476104736, "logits/rejected": -2.34191632270813, "logps/chosen": -5.891473770141602, "logps/rejected": -5.332913875579834, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.6617441177368164, "rewards/margins": -0.005861818790435791, "rewards/rejected": 0.6676059365272522, "step": 2141 }, { "epoch": 1.16, "learning_rate": 8.344246306293563e-08, "logits/chosen": -2.143784761428833, "logits/rejected": -2.1735944747924805, "logps/chosen": -4.600837230682373, "logps/rejected": -20.797971725463867, "loss": 0.2008, "rewards/accuracies": 1.0, "rewards/chosen": 1.6815649271011353, "rewards/margins": 1.503179907798767, "rewards/rejected": 0.17838497459888458, "step": 2142 }, { "epoch": 1.16, "learning_rate": 8.342622566089896e-08, "logits/chosen": -2.0087037086486816, "logits/rejected": -2.3035640716552734, "logps/chosen": -1.2642589807510376, "logps/rejected": -0.6302428245544434, "loss": 0.7372, "rewards/accuracies": 0.0, "rewards/chosen": 0.9587309956550598, "rewards/margins": -0.08627992868423462, "rewards/rejected": 1.0450109243392944, "step": 2143 }, { "epoch": 1.16, "learning_rate": 8.340998188258849e-08, "logits/chosen": -1.9288501739501953, "logits/rejected": -2.2353665828704834, "logps/chosen": -0.9697054028511047, "logps/rejected": -1.018836259841919, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.7966130375862122, "rewards/margins": 0.029958128929138184, "rewards/rejected": 0.766654908657074, "step": 2144 }, { "epoch": 1.16, "learning_rate": 8.339373173110279e-08, "logits/chosen": -2.0427451133728027, "logits/rejected": -2.232283115386963, "logps/chosen": -0.6789935231208801, "logps/rejected": -0.6103301644325256, "loss": 0.6982, "rewards/accuracies": 0.0, "rewards/chosen": 0.7665361166000366, "rewards/margins": -0.010144054889678955, "rewards/rejected": 0.7766801714897156, "step": 2145 }, { "epoch": 1.16, "learning_rate": 8.337747520954172e-08, "logits/chosen": -2.170142650604248, "logits/rejected": -2.3104617595672607, "logps/chosen": -2.343210458755493, "logps/rejected": -2.358097553253174, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.8116330504417419, "rewards/margins": 0.003438711166381836, "rewards/rejected": 0.8081943392753601, "step": 2146 }, { "epoch": 1.16, "learning_rate": 8.33612123210063e-08, "logits/chosen": -2.0902516841888428, "logits/rejected": -2.087003231048584, "logps/chosen": -0.4183000922203064, "logps/rejected": -4.245179176330566, "loss": 0.4817, "rewards/accuracies": 1.0, "rewards/chosen": 0.9746281504631042, "rewards/margins": 0.480009526014328, "rewards/rejected": 0.49461862444877625, "step": 2147 }, { "epoch": 1.16, "learning_rate": 8.33449430685988e-08, "logits/chosen": -2.205349922180176, "logits/rejected": -2.0525033473968506, "logps/chosen": -37.79588317871094, "logps/rejected": -3.9524333477020264, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 2.5253376960754395, "rewards/margins": 2.014221668243408, "rewards/rejected": 0.5111160278320312, "step": 2148 }, { "epoch": 1.16, "learning_rate": 8.332866745542266e-08, "logits/chosen": -2.1002864837646484, "logits/rejected": -2.1269936561584473, "logps/chosen": -2.2061774730682373, "logps/rejected": -6.710463047027588, "loss": 0.4672, "rewards/accuracies": 1.0, "rewards/chosen": 1.1696590185165405, "rewards/margins": 0.5183382034301758, "rewards/rejected": 0.6513208150863647, "step": 2149 }, { "epoch": 1.16, "learning_rate": 8.331238548458259e-08, "logits/chosen": -2.039191484451294, "logits/rejected": -2.0471062660217285, "logps/chosen": -3.9720234870910645, "logps/rejected": -2.880387544631958, "loss": 0.4522, "rewards/accuracies": 1.0, "rewards/chosen": 1.1900005340576172, "rewards/margins": 0.559056282043457, "rewards/rejected": 0.6309442520141602, "step": 2150 }, { "epoch": 1.16, "learning_rate": 8.329609715918446e-08, "logits/chosen": -2.129546642303467, "logits/rejected": -2.3097939491271973, "logps/chosen": -3.682692050933838, "logps/rejected": -3.6114375591278076, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.8661384582519531, "rewards/margins": 0.000759422779083252, "rewards/rejected": 0.8653790354728699, "step": 2151 }, { "epoch": 1.16, "learning_rate": 8.32798024823354e-08, "logits/chosen": -2.0342459678649902, "logits/rejected": -2.2633440494537354, "logps/chosen": -1.8714197874069214, "logps/rejected": -0.8737409710884094, "loss": 0.6171, "rewards/accuracies": 1.0, "rewards/chosen": 1.0823583602905273, "rewards/margins": 0.15835905075073242, "rewards/rejected": 0.9239993095397949, "step": 2152 }, { "epoch": 1.16, "learning_rate": 8.326350145714371e-08, "logits/chosen": -2.15432071685791, "logits/rejected": -2.328850269317627, "logps/chosen": -1.3354475498199463, "logps/rejected": -1.3109591007232666, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.865994393825531, "rewards/margins": -0.013953864574432373, "rewards/rejected": 0.8799482583999634, "step": 2153 }, { "epoch": 1.16, "learning_rate": 8.324719408671892e-08, "logits/chosen": -2.141305923461914, "logits/rejected": -2.3349711894989014, "logps/chosen": -6.608443737030029, "logps/rejected": -6.5349273681640625, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.8143331408500671, "rewards/margins": 0.001193702220916748, "rewards/rejected": 0.8131394386291504, "step": 2154 }, { "epoch": 1.16, "learning_rate": 8.323088037417179e-08, "logits/chosen": -1.9414695501327515, "logits/rejected": -2.2413220405578613, "logps/chosen": -3.9845595359802246, "logps/rejected": -4.4989519119262695, "loss": 0.6614, "rewards/accuracies": 1.0, "rewards/chosen": 0.9523548483848572, "rewards/margins": 0.06455641984939575, "rewards/rejected": 0.8877984285354614, "step": 2155 }, { "epoch": 1.16, "learning_rate": 8.321456032261423e-08, "logits/chosen": -2.0505526065826416, "logits/rejected": -2.050213098526001, "logps/chosen": -0.8976263999938965, "logps/rejected": -2.7376532554626465, "loss": 0.4995, "rewards/accuracies": 1.0, "rewards/chosen": 1.0445997714996338, "rewards/margins": 0.4340220093727112, "rewards/rejected": 0.6105777621269226, "step": 2156 }, { "epoch": 1.16, "learning_rate": 8.31982339351594e-08, "logits/chosen": -2.1814565658569336, "logits/rejected": -2.1778409481048584, "logps/chosen": -7.995329856872559, "logps/rejected": -0.6604896187782288, "loss": 0.6145, "rewards/accuracies": 1.0, "rewards/chosen": 0.8637523055076599, "rewards/margins": 0.16391396522521973, "rewards/rejected": 0.6998383402824402, "step": 2157 }, { "epoch": 1.16, "learning_rate": 8.318190121492173e-08, "logits/chosen": -1.9606329202651978, "logits/rejected": -2.2963812351226807, "logps/chosen": -1.1424360275268555, "logps/rejected": -1.08024263381958, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 1.0049995183944702, "rewards/margins": 0.02588897943496704, "rewards/rejected": 0.9791105389595032, "step": 2158 }, { "epoch": 1.16, "learning_rate": 8.316556216501674e-08, "logits/chosen": -2.0393269062042236, "logits/rejected": -2.290323257446289, "logps/chosen": -0.5232018232345581, "logps/rejected": -0.5369825959205627, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.8837451338768005, "rewards/margins": -0.004019260406494141, "rewards/rejected": 0.8877643942832947, "step": 2159 }, { "epoch": 1.17, "learning_rate": 8.314921678856119e-08, "logits/chosen": -2.136690855026245, "logits/rejected": -2.287803888320923, "logps/chosen": -0.7517395615577698, "logps/rejected": -2.9415464401245117, "loss": 0.6429, "rewards/accuracies": 1.0, "rewards/chosen": 1.1140421628952026, "rewards/margins": 0.10308659076690674, "rewards/rejected": 1.010955572128296, "step": 2160 }, { "epoch": 1.17, "learning_rate": 8.313286508867313e-08, "logits/chosen": -2.113999128341675, "logits/rejected": -2.125709295272827, "logps/chosen": -2.924726963043213, "logps/rejected": -12.40358829498291, "loss": 0.536, "rewards/accuracies": 1.0, "rewards/chosen": 1.247314691543579, "rewards/margins": 0.3436186909675598, "rewards/rejected": 0.9036960005760193, "step": 2161 }, { "epoch": 1.17, "learning_rate": 8.311650706847171e-08, "logits/chosen": -2.0561037063598633, "logits/rejected": -2.333507776260376, "logps/chosen": -4.423092365264893, "logps/rejected": -3.276992082595825, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.8209484219551086, "rewards/margins": -0.004867672920227051, "rewards/rejected": 0.8258160948753357, "step": 2162 }, { "epoch": 1.17, "learning_rate": 8.310014273107735e-08, "logits/chosen": -2.152473211288452, "logits/rejected": -2.294358968734741, "logps/chosen": -2.1444122791290283, "logps/rejected": -2.109930992126465, "loss": 0.7042, "rewards/accuracies": 0.0, "rewards/chosen": 0.9652994275093079, "rewards/margins": -0.022041916847229004, "rewards/rejected": 0.9873413443565369, "step": 2163 }, { "epoch": 1.17, "learning_rate": 8.308377207961166e-08, "logits/chosen": -2.209303379058838, "logits/rejected": -2.2048556804656982, "logps/chosen": -1.243061900138855, "logps/rejected": -3.8608479499816895, "loss": 0.5168, "rewards/accuracies": 1.0, "rewards/chosen": 1.0379637479782104, "rewards/margins": 0.3906669616699219, "rewards/rejected": 0.6472967863082886, "step": 2164 }, { "epoch": 1.17, "learning_rate": 8.306739511719744e-08, "logits/chosen": -2.0513570308685303, "logits/rejected": -2.3046693801879883, "logps/chosen": -3.652665615081787, "logps/rejected": -3.936349630355835, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.7125435471534729, "rewards/margins": -0.014099359512329102, "rewards/rejected": 0.726642906665802, "step": 2165 }, { "epoch": 1.17, "learning_rate": 8.30510118469587e-08, "logits/chosen": -2.0135648250579834, "logits/rejected": -2.006917715072632, "logps/chosen": -31.14627456665039, "logps/rejected": -8.296539306640625, "loss": 0.2792, "rewards/accuracies": 1.0, "rewards/chosen": 1.701209306716919, "rewards/margins": 1.1328914165496826, "rewards/rejected": 0.5683178305625916, "step": 2166 }, { "epoch": 1.17, "learning_rate": 8.303462227202067e-08, "logits/chosen": -2.228231906890869, "logits/rejected": -2.1931793689727783, "logps/chosen": -25.19396209716797, "logps/rejected": -26.398765563964844, "loss": 0.5511, "rewards/accuracies": 1.0, "rewards/chosen": 1.3956711292266846, "rewards/margins": 0.307735800743103, "rewards/rejected": 1.0879353284835815, "step": 2167 }, { "epoch": 1.17, "learning_rate": 8.301822639550975e-08, "logits/chosen": -2.2012383937835693, "logits/rejected": -2.1953940391540527, "logps/chosen": -1.9047199487686157, "logps/rejected": -10.700545310974121, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": 1.3408551216125488, "rewards/margins": 0.8373231887817383, "rewards/rejected": 0.5035319328308105, "step": 2168 }, { "epoch": 1.17, "learning_rate": 8.300182422055358e-08, "logits/chosen": -1.9686899185180664, "logits/rejected": -2.0337929725646973, "logps/chosen": -5.005902290344238, "logps/rejected": -22.822593688964844, "loss": 0.2934, "rewards/accuracies": 1.0, "rewards/chosen": 1.4798266887664795, "rewards/margins": 1.0759440660476685, "rewards/rejected": 0.40388259291648865, "step": 2169 }, { "epoch": 1.17, "learning_rate": 8.298541575028097e-08, "logits/chosen": -1.9886900186538696, "logits/rejected": -2.2500133514404297, "logps/chosen": -0.6101405620574951, "logps/rejected": -0.7855882048606873, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.8943468928337097, "rewards/margins": 0.0038750171661376953, "rewards/rejected": 0.890471875667572, "step": 2170 }, { "epoch": 1.17, "learning_rate": 8.296900098782197e-08, "logits/chosen": -2.0502350330352783, "logits/rejected": -2.0548083782196045, "logps/chosen": -1.2003934383392334, "logps/rejected": -18.323307037353516, "loss": 0.6501, "rewards/accuracies": 1.0, "rewards/chosen": 1.04326331615448, "rewards/margins": 0.08812034130096436, "rewards/rejected": 0.9551429748535156, "step": 2171 }, { "epoch": 1.17, "learning_rate": 8.295257993630778e-08, "logits/chosen": -2.0860843658447266, "logits/rejected": -2.0878918170928955, "logps/chosen": -4.587864398956299, "logps/rejected": -2.431044101715088, "loss": 0.2464, "rewards/accuracies": 1.0, "rewards/chosen": 1.8716404438018799, "rewards/margins": 1.275275468826294, "rewards/rejected": 0.5963650345802307, "step": 2172 }, { "epoch": 1.17, "learning_rate": 8.293615259887084e-08, "logits/chosen": -2.0507724285125732, "logits/rejected": -2.2918941974639893, "logps/chosen": -1.2137295007705688, "logps/rejected": -1.2766296863555908, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9531688094139099, "rewards/margins": 0.012369632720947266, "rewards/rejected": 0.9407991766929626, "step": 2173 }, { "epoch": 1.17, "learning_rate": 8.291971897864474e-08, "logits/chosen": -1.9459993839263916, "logits/rejected": -2.2205753326416016, "logps/chosen": -1.127731442451477, "logps/rejected": -1.0190647840499878, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.9859231114387512, "rewards/margins": 0.04248541593551636, "rewards/rejected": 0.9434376955032349, "step": 2174 }, { "epoch": 1.17, "learning_rate": 8.290327907876435e-08, "logits/chosen": -2.0220417976379395, "logits/rejected": -2.247154951095581, "logps/chosen": -3.3015897274017334, "logps/rejected": -3.517198324203491, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.7317792177200317, "rewards/margins": 0.026490330696105957, "rewards/rejected": 0.7052888870239258, "step": 2175 }, { "epoch": 1.17, "learning_rate": 8.288683290236566e-08, "logits/chosen": -2.045471668243408, "logits/rejected": -2.246535301208496, "logps/chosen": -0.5510832667350769, "logps/rejected": -0.4702689051628113, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.7642031311988831, "rewards/margins": 0.006690859794616699, "rewards/rejected": 0.7575122714042664, "step": 2176 }, { "epoch": 1.17, "learning_rate": 8.287038045258589e-08, "logits/chosen": -2.037177562713623, "logits/rejected": -2.05389666557312, "logps/chosen": -0.8363046646118164, "logps/rejected": -3.4493215084075928, "loss": 0.5924, "rewards/accuracies": 1.0, "rewards/chosen": 1.0101221799850464, "rewards/margins": 0.21277040243148804, "rewards/rejected": 0.7973517775535583, "step": 2177 }, { "epoch": 1.17, "learning_rate": 8.285392173256347e-08, "logits/chosen": -2.029547929763794, "logits/rejected": -2.2836601734161377, "logps/chosen": -2.4105589389801025, "logps/rejected": -2.4325783252716064, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.8515334129333496, "rewards/margins": 0.0033614039421081543, "rewards/rejected": 0.8481720089912415, "step": 2178 }, { "epoch": 1.18, "learning_rate": 8.283745674543798e-08, "logits/chosen": -2.0452404022216797, "logits/rejected": -2.247399091720581, "logps/chosen": -3.915520668029785, "logps/rejected": -3.9214017391204834, "loss": 0.6709, "rewards/accuracies": 1.0, "rewards/chosen": 0.7487292289733887, "rewards/margins": 0.04492378234863281, "rewards/rejected": 0.7038054466247559, "step": 2179 }, { "epoch": 1.18, "learning_rate": 8.282098549435024e-08, "logits/chosen": -1.9896047115325928, "logits/rejected": -2.29978346824646, "logps/chosen": -2.1073150634765625, "logps/rejected": -2.213700294494629, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.6001591682434082, "rewards/margins": 0.026252448558807373, "rewards/rejected": 0.5739067196846008, "step": 2180 }, { "epoch": 1.18, "learning_rate": 8.280450798244226e-08, "logits/chosen": -1.9727153778076172, "logits/rejected": -1.9779983758926392, "logps/chosen": -0.9496270418167114, "logps/rejected": -1.7448374032974243, "loss": 0.5143, "rewards/accuracies": 1.0, "rewards/chosen": 1.1620076894760132, "rewards/margins": 0.3968796133995056, "rewards/rejected": 0.7651280760765076, "step": 2181 }, { "epoch": 1.18, "learning_rate": 8.278802421285724e-08, "logits/chosen": -2.002578020095825, "logits/rejected": -2.1561522483825684, "logps/chosen": -1.1570253372192383, "logps/rejected": -1.1098637580871582, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": 0.9041592478752136, "rewards/margins": -0.008270561695098877, "rewards/rejected": 0.9124298095703125, "step": 2182 }, { "epoch": 1.18, "learning_rate": 8.277153418873955e-08, "logits/chosen": -2.056044816970825, "logits/rejected": -2.2699127197265625, "logps/chosen": -0.7987770438194275, "logps/rejected": -0.9808896780014038, "loss": 0.6732, "rewards/accuracies": 1.0, "rewards/chosen": 0.7795396447181702, "rewards/margins": 0.04032182693481445, "rewards/rejected": 0.7392178177833557, "step": 2183 }, { "epoch": 1.18, "learning_rate": 8.275503791323474e-08, "logits/chosen": -2.079209327697754, "logits/rejected": -2.090564250946045, "logps/chosen": -2.215245246887207, "logps/rejected": -4.553290843963623, "loss": 0.6674, "rewards/accuracies": 1.0, "rewards/chosen": 1.05750572681427, "rewards/margins": 0.05215156078338623, "rewards/rejected": 1.0053541660308838, "step": 2184 }, { "epoch": 1.18, "learning_rate": 8.273853538948967e-08, "logits/chosen": -2.072798013687134, "logits/rejected": -2.346303701400757, "logps/chosen": -0.22808176279067993, "logps/rejected": -0.44448214769363403, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 1.0184096097946167, "rewards/margins": 0.04277890920639038, "rewards/rejected": 0.9756307005882263, "step": 2185 }, { "epoch": 1.18, "learning_rate": 8.272202662065223e-08, "logits/chosen": -2.056318521499634, "logits/rejected": -2.293006658554077, "logps/chosen": -1.6800165176391602, "logps/rejected": -17.34626579284668, "loss": 0.3972, "rewards/accuracies": 1.0, "rewards/chosen": 0.9878862500190735, "rewards/margins": 0.7180614471435547, "rewards/rejected": 0.2698248028755188, "step": 2186 }, { "epoch": 1.18, "learning_rate": 8.270551160987161e-08, "logits/chosen": -2.086923360824585, "logits/rejected": -2.27614426612854, "logps/chosen": -0.3145914077758789, "logps/rejected": -0.3293312191963196, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.8027090430259705, "rewards/margins": 0.006115913391113281, "rewards/rejected": 0.7965931296348572, "step": 2187 }, { "epoch": 1.18, "learning_rate": 8.268899036029815e-08, "logits/chosen": -2.100982427597046, "logits/rejected": -2.295128107070923, "logps/chosen": -0.7100516557693481, "logps/rejected": -0.7205588817596436, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8259391188621521, "rewards/margins": 0.00875866413116455, "rewards/rejected": 0.8171804547309875, "step": 2188 }, { "epoch": 1.18, "learning_rate": 8.267246287508338e-08, "logits/chosen": -2.0533480644226074, "logits/rejected": -2.046523094177246, "logps/chosen": -4.140654563903809, "logps/rejected": -3.0522937774658203, "loss": 0.5373, "rewards/accuracies": 1.0, "rewards/chosen": 0.9055100679397583, "rewards/margins": 0.3406435251235962, "rewards/rejected": 0.5648665428161621, "step": 2189 }, { "epoch": 1.18, "learning_rate": 8.265592915738005e-08, "logits/chosen": -1.8810824155807495, "logits/rejected": -2.2656662464141846, "logps/chosen": -0.9525820016860962, "logps/rejected": -1.058188796043396, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 1.0589793920516968, "rewards/margins": -0.0015434026718139648, "rewards/rejected": 1.0605227947235107, "step": 2190 }, { "epoch": 1.18, "learning_rate": 8.263938921034205e-08, "logits/chosen": -2.0906875133514404, "logits/rejected": -2.0916190147399902, "logps/chosen": -3.6652417182922363, "logps/rejected": -0.8696127533912659, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.7114636301994324, "rewards/margins": 0.018112897872924805, "rewards/rejected": 0.6933507323265076, "step": 2191 }, { "epoch": 1.18, "learning_rate": 8.26228430371245e-08, "logits/chosen": -2.107313632965088, "logits/rejected": -2.1062543392181396, "logps/chosen": -1.2812690734863281, "logps/rejected": -3.1607277393341064, "loss": 0.5419, "rewards/accuracies": 1.0, "rewards/chosen": 0.9037055969238281, "rewards/margins": 0.32948607206344604, "rewards/rejected": 0.5742195248603821, "step": 2192 }, { "epoch": 1.18, "learning_rate": 8.26062906408837e-08, "logits/chosen": -2.16013765335083, "logits/rejected": -2.119126319885254, "logps/chosen": -22.322078704833984, "logps/rejected": -3.7897450923919678, "loss": 0.3088, "rewards/accuracies": 1.0, "rewards/chosen": 1.603920817375183, "rewards/margins": 1.0165199041366577, "rewards/rejected": 0.5874009132385254, "step": 2193 }, { "epoch": 1.18, "learning_rate": 8.258973202477708e-08, "logits/chosen": -2.061849594116211, "logits/rejected": -2.2584228515625, "logps/chosen": -1.121300220489502, "logps/rejected": -1.1346853971481323, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.9478643536567688, "rewards/margins": -0.017651081085205078, "rewards/rejected": 0.9655154347419739, "step": 2194 }, { "epoch": 1.18, "learning_rate": 8.257316719196337e-08, "logits/chosen": -1.950980305671692, "logits/rejected": -1.9492027759552002, "logps/chosen": -8.025449752807617, "logps/rejected": -3.0306286811828613, "loss": 0.3893, "rewards/accuracies": 1.0, "rewards/chosen": 1.3750892877578735, "rewards/margins": 0.7423760890960693, "rewards/rejected": 0.6327131986618042, "step": 2195 }, { "epoch": 1.18, "learning_rate": 8.25565961456024e-08, "logits/chosen": -2.0638670921325684, "logits/rejected": -2.061680555343628, "logps/chosen": -1.0544201135635376, "logps/rejected": -1.7348757982254028, "loss": 0.5872, "rewards/accuracies": 1.0, "rewards/chosen": 1.00826895236969, "rewards/margins": 0.22439855337142944, "rewards/rejected": 0.7838703989982605, "step": 2196 }, { "epoch": 1.19, "learning_rate": 8.254001888885518e-08, "logits/chosen": -2.0180816650390625, "logits/rejected": -2.249746799468994, "logps/chosen": -0.8463175296783447, "logps/rejected": -0.8975532650947571, "loss": 0.6615, "rewards/accuracies": 1.0, "rewards/chosen": 0.9737125635147095, "rewards/margins": 0.0642278790473938, "rewards/rejected": 0.9094846844673157, "step": 2197 }, { "epoch": 1.19, "learning_rate": 8.252343542488396e-08, "logits/chosen": -1.9995933771133423, "logits/rejected": -2.2452948093414307, "logps/chosen": -2.4224562644958496, "logps/rejected": -2.301270008087158, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.6851020455360413, "rewards/margins": 0.02846813201904297, "rewards/rejected": 0.6566339135169983, "step": 2198 }, { "epoch": 1.19, "learning_rate": 8.250684575685213e-08, "logits/chosen": -2.227036237716675, "logits/rejected": -2.1780288219451904, "logps/chosen": -24.438053131103516, "logps/rejected": -4.109597206115723, "loss": 0.2505, "rewards/accuracies": 1.0, "rewards/chosen": 2.0446362495422363, "rewards/margins": 1.2565054893493652, "rewards/rejected": 0.7881307601928711, "step": 2199 }, { "epoch": 1.19, "learning_rate": 8.24902498879243e-08, "logits/chosen": -2.1452796459198, "logits/rejected": -2.170779228210449, "logps/chosen": -1.0273411273956299, "logps/rejected": -9.210683822631836, "loss": 0.4349, "rewards/accuracies": 1.0, "rewards/chosen": 1.081958293914795, "rewards/margins": 0.607214093208313, "rewards/rejected": 0.4747442305088043, "step": 2200 }, { "epoch": 1.19, "learning_rate": 8.247364782126625e-08, "logits/chosen": -2.1456992626190186, "logits/rejected": -2.3379812240600586, "logps/chosen": -0.9700202345848083, "logps/rejected": -0.936407208442688, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9293138384819031, "rewards/margins": 0.005598604679107666, "rewards/rejected": 0.9237152338027954, "step": 2201 }, { "epoch": 1.19, "learning_rate": 8.245703956004489e-08, "logits/chosen": -2.0349879264831543, "logits/rejected": -2.033475637435913, "logps/chosen": -6.2134108543396, "logps/rejected": -2.359560251235962, "loss": 0.3274, "rewards/accuracies": 1.0, "rewards/chosen": 1.5702089071273804, "rewards/margins": 0.9484426379203796, "rewards/rejected": 0.6217662692070007, "step": 2202 }, { "epoch": 1.19, "learning_rate": 8.244042510742836e-08, "logits/chosen": -2.0602641105651855, "logits/rejected": -2.0457704067230225, "logps/chosen": -19.23366928100586, "logps/rejected": -5.10784912109375, "loss": 0.4802, "rewards/accuracies": 1.0, "rewards/chosen": 1.3612384796142578, "rewards/margins": 0.4838833808898926, "rewards/rejected": 0.8773550987243652, "step": 2203 }, { "epoch": 1.19, "learning_rate": 8.242380446658601e-08, "logits/chosen": -2.104490041732788, "logits/rejected": -2.074901819229126, "logps/chosen": -27.77373504638672, "logps/rejected": -16.851476669311523, "loss": 0.3596, "rewards/accuracies": 1.0, "rewards/chosen": 1.7213653326034546, "rewards/margins": 0.8376880288124084, "rewards/rejected": 0.8836773037910461, "step": 2204 }, { "epoch": 1.19, "learning_rate": 8.240717764068835e-08, "logits/chosen": -2.1473402976989746, "logits/rejected": -2.1510732173919678, "logps/chosen": -3.5495917797088623, "logps/rejected": -9.651220321655273, "loss": 0.4338, "rewards/accuracies": 1.0, "rewards/chosen": 1.3295073509216309, "rewards/margins": 0.6104851365089417, "rewards/rejected": 0.7190222144126892, "step": 2205 }, { "epoch": 1.19, "learning_rate": 8.2390544632907e-08, "logits/chosen": -2.012971878051758, "logits/rejected": -2.2132794857025146, "logps/chosen": -2.0021705627441406, "logps/rejected": -1.8122410774230957, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.9378843307495117, "rewards/margins": -0.0024713873863220215, "rewards/rejected": 0.9403557181358337, "step": 2206 }, { "epoch": 1.19, "learning_rate": 8.237390544641484e-08, "logits/chosen": -2.1209089756011963, "logits/rejected": -2.114262819290161, "logps/chosen": -1.4689340591430664, "logps/rejected": -7.46623420715332, "loss": 0.4239, "rewards/accuracies": 1.0, "rewards/chosen": 1.1193921566009521, "rewards/margins": 0.638962984085083, "rewards/rejected": 0.48042917251586914, "step": 2207 }, { "epoch": 1.19, "learning_rate": 8.235726008438593e-08, "logits/chosen": -2.0165746212005615, "logits/rejected": -2.02005672454834, "logps/chosen": -0.6946569085121155, "logps/rejected": -2.9110469818115234, "loss": 0.5097, "rewards/accuracies": 1.0, "rewards/chosen": 0.9877774119377136, "rewards/margins": 0.40839773416519165, "rewards/rejected": 0.579379677772522, "step": 2208 }, { "epoch": 1.19, "learning_rate": 8.234060854999545e-08, "logits/chosen": -1.9267220497131348, "logits/rejected": -2.1939339637756348, "logps/chosen": -0.327633798122406, "logps/rejected": -0.3258981704711914, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.9723164439201355, "rewards/margins": -0.008704304695129395, "rewards/rejected": 0.9810207486152649, "step": 2209 }, { "epoch": 1.19, "learning_rate": 8.232395084641981e-08, "logits/chosen": -2.0018630027770996, "logits/rejected": -2.306004524230957, "logps/chosen": -2.8393936157226562, "logps/rejected": -2.51125168800354, "loss": 0.7093, "rewards/accuracies": 0.0, "rewards/chosen": 0.5630913972854614, "rewards/margins": -0.032008469104766846, "rewards/rejected": 0.5950998663902283, "step": 2210 }, { "epoch": 1.19, "learning_rate": 8.230728697683658e-08, "logits/chosen": -1.9659972190856934, "logits/rejected": -2.267146348953247, "logps/chosen": -1.5564650297164917, "logps/rejected": -1.5088975429534912, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 1.0672053098678589, "rewards/margins": 0.03751957416534424, "rewards/rejected": 1.0296857357025146, "step": 2211 }, { "epoch": 1.19, "learning_rate": 8.229061694442449e-08, "logits/chosen": -2.027235269546509, "logits/rejected": -2.298579216003418, "logps/chosen": -3.146371364593506, "logps/rejected": -2.6510140895843506, "loss": 0.7056, "rewards/accuracies": 0.0, "rewards/chosen": 0.8380630612373352, "rewards/margins": -0.024717509746551514, "rewards/rejected": 0.8627805709838867, "step": 2212 }, { "epoch": 1.19, "learning_rate": 8.227394075236346e-08, "logits/chosen": -2.000077486038208, "logits/rejected": -1.9832687377929688, "logps/chosen": -32.60173797607422, "logps/rejected": -8.696333885192871, "loss": 0.2917, "rewards/accuracies": 1.0, "rewards/chosen": 1.4300121068954468, "rewards/margins": 1.0827364921569824, "rewards/rejected": 0.34727564454078674, "step": 2213 }, { "epoch": 1.19, "learning_rate": 8.22572584038346e-08, "logits/chosen": -2.069838523864746, "logits/rejected": -2.2640488147735596, "logps/chosen": -2.103835344314575, "logps/rejected": -1.9916701316833496, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.8067899942398071, "rewards/margins": 0.008538305759429932, "rewards/rejected": 0.7982516884803772, "step": 2214 }, { "epoch": 1.19, "learning_rate": 8.224056990202016e-08, "logits/chosen": -2.0202317237854004, "logits/rejected": -2.213878870010376, "logps/chosen": -10.099294662475586, "logps/rejected": -8.250246047973633, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.533178448677063, "rewards/margins": 0.005565941333770752, "rewards/rejected": 0.5276125073432922, "step": 2215 }, { "epoch": 1.2, "learning_rate": 8.222387525010357e-08, "logits/chosen": -1.9910831451416016, "logits/rejected": -1.994476556777954, "logps/chosen": -3.128413677215576, "logps/rejected": -6.136429309844971, "loss": 0.5191, "rewards/accuracies": 1.0, "rewards/chosen": 0.9110248684883118, "rewards/margins": 0.3849213123321533, "rewards/rejected": 0.5261035561561584, "step": 2216 }, { "epoch": 1.2, "learning_rate": 8.22071744512695e-08, "logits/chosen": -1.9988560676574707, "logits/rejected": -2.012239933013916, "logps/chosen": -30.00030517578125, "logps/rejected": -21.163089752197266, "loss": 0.5122, "rewards/accuracies": 1.0, "rewards/chosen": 0.7914852499961853, "rewards/margins": 0.40193235874176025, "rewards/rejected": 0.38955289125442505, "step": 2217 }, { "epoch": 1.2, "learning_rate": 8.219046750870366e-08, "logits/chosen": -1.9930403232574463, "logits/rejected": -2.227574110031128, "logps/chosen": -0.4399818181991577, "logps/rejected": -0.448617160320282, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.9138206839561462, "rewards/margins": 0.006227433681488037, "rewards/rejected": 0.9075932502746582, "step": 2218 }, { "epoch": 1.2, "learning_rate": 8.217375442559308e-08, "logits/chosen": -2.074711322784424, "logits/rejected": -2.2682583332061768, "logps/chosen": -0.4137892723083496, "logps/rejected": -0.48324015736579895, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.8600689172744751, "rewards/margins": 0.010616481304168701, "rewards/rejected": 0.8494524359703064, "step": 2219 }, { "epoch": 1.2, "learning_rate": 8.215703520512587e-08, "logits/chosen": -2.0166537761688232, "logits/rejected": -2.0091748237609863, "logps/chosen": -14.868236541748047, "logps/rejected": -6.048943996429443, "loss": 0.3719, "rewards/accuracies": 1.0, "rewards/chosen": 1.5244112014770508, "rewards/margins": 0.797309935092926, "rewards/rejected": 0.7271012663841248, "step": 2220 }, { "epoch": 1.2, "learning_rate": 8.21403098504913e-08, "logits/chosen": -2.063185691833496, "logits/rejected": -2.237227201461792, "logps/chosen": -1.0272704362869263, "logps/rejected": -4.173983097076416, "loss": 0.5813, "rewards/accuracies": 1.0, "rewards/chosen": 0.8449975252151489, "rewards/margins": 0.23785632848739624, "rewards/rejected": 0.6071411967277527, "step": 2221 }, { "epoch": 1.2, "learning_rate": 8.212357836487988e-08, "logits/chosen": -2.1995198726654053, "logits/rejected": -2.2888851165771484, "logps/chosen": -18.1879825592041, "logps/rejected": -9.638492584228516, "loss": 0.5961, "rewards/accuracies": 1.0, "rewards/chosen": 1.0474170446395874, "rewards/margins": 0.204490065574646, "rewards/rejected": 0.8429269790649414, "step": 2222 }, { "epoch": 1.2, "learning_rate": 8.210684075148325e-08, "logits/chosen": -1.9647859334945679, "logits/rejected": -1.9650384187698364, "logps/chosen": -1.7845673561096191, "logps/rejected": -0.8615051507949829, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.8578460812568665, "rewards/margins": -0.005261600017547607, "rewards/rejected": 0.8631076812744141, "step": 2223 }, { "epoch": 1.2, "learning_rate": 8.209009701349418e-08, "logits/chosen": -1.9449942111968994, "logits/rejected": -2.2277491092681885, "logps/chosen": -8.019922256469727, "logps/rejected": -1.2534151077270508, "loss": 0.6064, "rewards/accuracies": 1.0, "rewards/chosen": 1.0949268341064453, "rewards/margins": 0.18168914318084717, "rewards/rejected": 0.9132376909255981, "step": 2224 }, { "epoch": 1.2, "learning_rate": 8.207334715410671e-08, "logits/chosen": -2.0624022483825684, "logits/rejected": -2.2526214122772217, "logps/chosen": -0.7054667472839355, "logps/rejected": -0.7090533375740051, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.8236633539199829, "rewards/margins": 0.029373526573181152, "rewards/rejected": 0.7942898273468018, "step": 2225 }, { "epoch": 1.2, "learning_rate": 8.205659117651594e-08, "logits/chosen": -2.1336889266967773, "logits/rejected": -2.0019707679748535, "logps/chosen": -35.18791580200195, "logps/rejected": -1.834403395652771, "loss": 0.2237, "rewards/accuracies": 1.0, "rewards/chosen": 2.116030216217041, "rewards/margins": 1.383538007736206, "rewards/rejected": 0.7324921488761902, "step": 2226 }, { "epoch": 1.2, "learning_rate": 8.20398290839182e-08, "logits/chosen": -1.979860782623291, "logits/rejected": -2.29966402053833, "logps/chosen": -0.9448663592338562, "logps/rejected": -0.9680461883544922, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.9925130009651184, "rewards/margins": -0.0011499524116516113, "rewards/rejected": 0.99366295337677, "step": 2227 }, { "epoch": 1.2, "learning_rate": 8.202306087951097e-08, "logits/chosen": -2.042614459991455, "logits/rejected": -2.0420284271240234, "logps/chosen": -3.2394073009490967, "logps/rejected": -3.932097911834717, "loss": 0.2989, "rewards/accuracies": 1.0, "rewards/chosen": 1.5505985021591187, "rewards/margins": 1.0546389818191528, "rewards/rejected": 0.4959595203399658, "step": 2228 }, { "epoch": 1.2, "learning_rate": 8.200628656649289e-08, "logits/chosen": -2.0963971614837646, "logits/rejected": -2.2990386486053467, "logps/chosen": -3.94417667388916, "logps/rejected": -3.854492664337158, "loss": 0.6986, "rewards/accuracies": 0.0, "rewards/chosen": 0.9596417546272278, "rewards/margins": -0.010898888111114502, "rewards/rejected": 0.9705406427383423, "step": 2229 }, { "epoch": 1.2, "learning_rate": 8.198950614806378e-08, "logits/chosen": -1.9703959226608276, "logits/rejected": -2.013746976852417, "logps/chosen": -7.141430377960205, "logps/rejected": -8.354942321777344, "loss": 0.3336, "rewards/accuracies": 1.0, "rewards/chosen": 1.621654748916626, "rewards/margins": 0.9265395402908325, "rewards/rejected": 0.6951152086257935, "step": 2230 }, { "epoch": 1.2, "learning_rate": 8.197271962742462e-08, "logits/chosen": -2.029080867767334, "logits/rejected": -2.24990177154541, "logps/chosen": -1.323312759399414, "logps/rejected": -46.17237854003906, "loss": 0.2491, "rewards/accuracies": 1.0, "rewards/chosen": 0.8754624724388123, "rewards/margins": 1.2626428604125977, "rewards/rejected": -0.3871803283691406, "step": 2231 }, { "epoch": 1.2, "learning_rate": 8.195592700777753e-08, "logits/chosen": -2.0680782794952393, "logits/rejected": -2.3225038051605225, "logps/chosen": -0.33029884099960327, "logps/rejected": -0.39375045895576477, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8994776606559753, "rewards/margins": 0.012126684188842773, "rewards/rejected": 0.8873509764671326, "step": 2232 }, { "epoch": 1.2, "learning_rate": 8.193912829232583e-08, "logits/chosen": -2.0338737964630127, "logits/rejected": -2.0386838912963867, "logps/chosen": -2.0848140716552734, "logps/rejected": -1.8780494928359985, "loss": 0.4912, "rewards/accuracies": 1.0, "rewards/chosen": 1.15310537815094, "rewards/margins": 0.45531487464904785, "rewards/rejected": 0.6977905035018921, "step": 2233 }, { "epoch": 1.2, "learning_rate": 8.192232348427399e-08, "logits/chosen": -1.9462685585021973, "logits/rejected": -1.9547713994979858, "logps/chosen": -0.4494907855987549, "logps/rejected": -9.819182395935059, "loss": 0.3821, "rewards/accuracies": 1.0, "rewards/chosen": 1.0418124198913574, "rewards/margins": 0.7648971080780029, "rewards/rejected": 0.2769152820110321, "step": 2234 }, { "epoch": 1.21, "learning_rate": 8.190551258682761e-08, "logits/chosen": -2.1247684955596924, "logits/rejected": -2.122713327407837, "logps/chosen": -9.951772689819336, "logps/rejected": -2.554128646850586, "loss": 0.438, "rewards/accuracies": 1.0, "rewards/chosen": 1.2820218801498413, "rewards/margins": 0.5984370708465576, "rewards/rejected": 0.6835848093032837, "step": 2235 }, { "epoch": 1.21, "learning_rate": 8.188869560319351e-08, "logits/chosen": -2.1394479274749756, "logits/rejected": -2.2670297622680664, "logps/chosen": -5.172576904296875, "logps/rejected": -4.837993144989014, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.8199692964553833, "rewards/margins": 0.012454688549041748, "rewards/rejected": 0.8075146079063416, "step": 2236 }, { "epoch": 1.21, "learning_rate": 8.187187253657963e-08, "logits/chosen": -2.0298643112182617, "logits/rejected": -2.0237529277801514, "logps/chosen": -2.1782889366149902, "logps/rejected": -7.006106376647949, "loss": 0.4942, "rewards/accuracies": 1.0, "rewards/chosen": 1.0898491144180298, "rewards/margins": 0.44746583700180054, "rewards/rejected": 0.6423832774162292, "step": 2237 }, { "epoch": 1.21, "learning_rate": 8.18550433901951e-08, "logits/chosen": -2.0306894779205322, "logits/rejected": -2.022655487060547, "logps/chosen": -3.094581127166748, "logps/rejected": -2.8193321228027344, "loss": 0.6056, "rewards/accuracies": 1.0, "rewards/chosen": 0.9476564526557922, "rewards/margins": 0.18351298570632935, "rewards/rejected": 0.7641434669494629, "step": 2238 }, { "epoch": 1.21, "learning_rate": 8.183820816725013e-08, "logits/chosen": -2.140022039413452, "logits/rejected": -2.147732734680176, "logps/chosen": -6.18507194519043, "logps/rejected": -2.9580817222595215, "loss": 0.7419, "rewards/accuracies": 0.0, "rewards/chosen": 1.1238657236099243, "rewards/margins": -0.09527528285980225, "rewards/rejected": 1.2191410064697266, "step": 2239 }, { "epoch": 1.21, "learning_rate": 8.182136687095623e-08, "logits/chosen": -2.036396026611328, "logits/rejected": -2.274614095687866, "logps/chosen": -0.8115300536155701, "logps/rejected": -0.7837299704551697, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.9123611450195312, "rewards/margins": 0.02872598171234131, "rewards/rejected": 0.8836351633071899, "step": 2240 }, { "epoch": 1.21, "learning_rate": 8.180451950452594e-08, "logits/chosen": -1.9932674169540405, "logits/rejected": -1.9995096921920776, "logps/chosen": -2.0421881675720215, "logps/rejected": -3.1618306636810303, "loss": 0.53, "rewards/accuracies": 1.0, "rewards/chosen": 0.935032069683075, "rewards/margins": 0.35810619592666626, "rewards/rejected": 0.5769258737564087, "step": 2241 }, { "epoch": 1.21, "learning_rate": 8.178766607117303e-08, "logits/chosen": -1.9897509813308716, "logits/rejected": -2.2867801189422607, "logps/chosen": -5.131250381469727, "logps/rejected": -5.029557228088379, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.7812668085098267, "rewards/margins": 0.012635350227355957, "rewards/rejected": 0.7686314582824707, "step": 2242 }, { "epoch": 1.21, "learning_rate": 8.177080657411238e-08, "logits/chosen": -2.064605951309204, "logits/rejected": -2.0633533000946045, "logps/chosen": -4.176511287689209, "logps/rejected": -3.2639353275299072, "loss": 0.5855, "rewards/accuracies": 1.0, "rewards/chosen": 0.8910889029502869, "rewards/margins": 0.2281988263130188, "rewards/rejected": 0.6628900766372681, "step": 2243 }, { "epoch": 1.21, "learning_rate": 8.175394101656005e-08, "logits/chosen": -2.0968079566955566, "logits/rejected": -2.273056983947754, "logps/chosen": -0.31701231002807617, "logps/rejected": -0.29041820764541626, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8966081738471985, "rewards/margins": 0.0020502805709838867, "rewards/rejected": 0.8945578932762146, "step": 2244 }, { "epoch": 1.21, "learning_rate": 8.17370694017333e-08, "logits/chosen": -2.1409177780151367, "logits/rejected": -2.118760108947754, "logps/chosen": -23.524723052978516, "logps/rejected": -15.446944236755371, "loss": 0.3395, "rewards/accuracies": 1.0, "rewards/chosen": 1.7467167377471924, "rewards/margins": 0.9058379530906677, "rewards/rejected": 0.8408787846565247, "step": 2245 }, { "epoch": 1.21, "learning_rate": 8.172019173285046e-08, "logits/chosen": -2.0631167888641357, "logits/rejected": -2.0594592094421387, "logps/chosen": -12.3151273727417, "logps/rejected": -11.417303085327148, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9681094288825989, "rewards/margins": 0.017755210399627686, "rewards/rejected": 0.9503542184829712, "step": 2246 }, { "epoch": 1.21, "learning_rate": 8.170330801313105e-08, "logits/chosen": -2.0145905017852783, "logits/rejected": -2.3271067142486572, "logps/chosen": -1.0160861015319824, "logps/rejected": -0.8797248005867004, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 0.9972001910209656, "rewards/margins": -0.02184373140335083, "rewards/rejected": 1.0190439224243164, "step": 2247 }, { "epoch": 1.21, "learning_rate": 8.168641824579578e-08, "logits/chosen": -1.9724268913269043, "logits/rejected": -2.251213550567627, "logps/chosen": -0.6552871465682983, "logps/rejected": -0.7475399971008301, "loss": 0.6673, "rewards/accuracies": 1.0, "rewards/chosen": 0.8110766410827637, "rewards/margins": 0.052413105964660645, "rewards/rejected": 0.758663535118103, "step": 2248 }, { "epoch": 1.21, "learning_rate": 8.16695224340665e-08, "logits/chosen": -1.997472882270813, "logits/rejected": -2.2524051666259766, "logps/chosen": -0.418836385011673, "logps/rejected": -0.4167075455188751, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.9798237085342407, "rewards/margins": 0.027476966381072998, "rewards/rejected": 0.9523467421531677, "step": 2249 }, { "epoch": 1.21, "learning_rate": 8.165262058116615e-08, "logits/chosen": -2.0926055908203125, "logits/rejected": -2.092910051345825, "logps/chosen": -1.7809582948684692, "logps/rejected": -0.928392767906189, "loss": 0.6547, "rewards/accuracies": 1.0, "rewards/chosen": 0.9439799189567566, "rewards/margins": 0.07848328351974487, "rewards/rejected": 0.8654966354370117, "step": 2250 }, { "epoch": 1.21, "learning_rate": 8.16357126903189e-08, "logits/chosen": -1.9610928297042847, "logits/rejected": -2.296600818634033, "logps/chosen": -0.32665762305259705, "logps/rejected": -0.3539312481880188, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9506852030754089, "rewards/margins": 0.009314239025115967, "rewards/rejected": 0.941370964050293, "step": 2251 }, { "epoch": 1.21, "learning_rate": 8.161879876475004e-08, "logits/chosen": -1.925763487815857, "logits/rejected": -1.9110125303268433, "logps/chosen": -8.549667358398438, "logps/rejected": -1.209165096282959, "loss": 0.5859, "rewards/accuracies": 1.0, "rewards/chosen": 1.288736343383789, "rewards/margins": 0.2274686098098755, "rewards/rejected": 1.0612677335739136, "step": 2252 }, { "epoch": 1.22, "learning_rate": 8.1601878807686e-08, "logits/chosen": -2.1532912254333496, "logits/rejected": -2.1503500938415527, "logps/chosen": -0.4018155038356781, "logps/rejected": -3.9494094848632812, "loss": 0.5142, "rewards/accuracies": 1.0, "rewards/chosen": 0.866352379322052, "rewards/margins": 0.3971138000488281, "rewards/rejected": 0.4692385792732239, "step": 2253 }, { "epoch": 1.22, "learning_rate": 8.158495282235439e-08, "logits/chosen": -2.0798707008361816, "logits/rejected": -2.252753973007202, "logps/chosen": -9.619951248168945, "logps/rejected": -10.760775566101074, "loss": 0.709, "rewards/accuracies": 0.0, "rewards/chosen": 0.9184600710868835, "rewards/margins": -0.03148013353347778, "rewards/rejected": 0.9499402046203613, "step": 2254 }, { "epoch": 1.22, "learning_rate": 8.156802081198395e-08, "logits/chosen": -2.0989990234375, "logits/rejected": -2.303659677505493, "logps/chosen": -0.820371687412262, "logps/rejected": -0.8458845019340515, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.973327100276947, "rewards/margins": 0.024047493934631348, "rewards/rejected": 0.9492796063423157, "step": 2255 }, { "epoch": 1.22, "learning_rate": 8.155108277980456e-08, "logits/chosen": -1.9374345541000366, "logits/rejected": -2.2937960624694824, "logps/chosen": -7.735732078552246, "logps/rejected": -7.8195648193359375, "loss": 0.6638, "rewards/accuracies": 1.0, "rewards/chosen": 1.053799033164978, "rewards/margins": 0.059610724449157715, "rewards/rejected": 0.9941883087158203, "step": 2256 }, { "epoch": 1.22, "learning_rate": 8.153413872904727e-08, "logits/chosen": -2.0705792903900146, "logits/rejected": -2.070356845855713, "logps/chosen": -1.315899133682251, "logps/rejected": -2.068634033203125, "loss": 0.6475, "rewards/accuracies": 1.0, "rewards/chosen": 1.0717238187789917, "rewards/margins": 0.09346503019332886, "rewards/rejected": 0.9782587885856628, "step": 2257 }, { "epoch": 1.22, "learning_rate": 8.151718866294428e-08, "logits/chosen": -1.9816478490829468, "logits/rejected": -2.2622487545013428, "logps/chosen": -1.8462368249893188, "logps/rejected": -10.724449157714844, "loss": 0.6317, "rewards/accuracies": 1.0, "rewards/chosen": 0.9456169009208679, "rewards/margins": 0.1269996166229248, "rewards/rejected": 0.8186172842979431, "step": 2258 }, { "epoch": 1.22, "learning_rate": 8.150023258472892e-08, "logits/chosen": -2.155062675476074, "logits/rejected": -2.2467286586761475, "logps/chosen": -1.8844232559204102, "logps/rejected": -1.7071402072906494, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.9262504577636719, "rewards/margins": 0.009559035301208496, "rewards/rejected": 0.9166914224624634, "step": 2259 }, { "epoch": 1.22, "learning_rate": 8.148327049763568e-08, "logits/chosen": -2.0451676845550537, "logits/rejected": -2.01680326461792, "logps/chosen": -15.44015121459961, "logps/rejected": -2.3115055561065674, "loss": 0.3302, "rewards/accuracies": 1.0, "rewards/chosen": 1.4765650033950806, "rewards/margins": 0.9383841753005981, "rewards/rejected": 0.5381808280944824, "step": 2260 }, { "epoch": 1.22, "learning_rate": 8.146630240490017e-08, "logits/chosen": -2.015641450881958, "logits/rejected": -2.0113730430603027, "logps/chosen": -3.31669282913208, "logps/rejected": -4.310216426849365, "loss": 0.3034, "rewards/accuracies": 1.0, "rewards/chosen": 1.4863470792770386, "rewards/margins": 1.037140965461731, "rewards/rejected": 0.4492061138153076, "step": 2261 }, { "epoch": 1.22, "learning_rate": 8.144932830975918e-08, "logits/chosen": -2.1400904655456543, "logits/rejected": -2.143059253692627, "logps/chosen": -2.444945812225342, "logps/rejected": -2.5063016414642334, "loss": 0.5377, "rewards/accuracies": 1.0, "rewards/chosen": 0.9940563440322876, "rewards/margins": 0.3395412564277649, "rewards/rejected": 0.6545150876045227, "step": 2262 }, { "epoch": 1.22, "learning_rate": 8.143234821545061e-08, "logits/chosen": -2.123211622238159, "logits/rejected": -2.3244740962982178, "logps/chosen": -1.602522373199463, "logps/rejected": -1.4785946607589722, "loss": 0.7027, "rewards/accuracies": 0.0, "rewards/chosen": 0.9702696204185486, "rewards/margins": -0.019012510776519775, "rewards/rejected": 0.9892821311950684, "step": 2263 }, { "epoch": 1.22, "learning_rate": 8.141536212521358e-08, "logits/chosen": -2.012807846069336, "logits/rejected": -2.0209431648254395, "logps/chosen": -1.0027813911437988, "logps/rejected": -4.985413074493408, "loss": 0.4972, "rewards/accuracies": 1.0, "rewards/chosen": 1.0434277057647705, "rewards/margins": 0.4399253726005554, "rewards/rejected": 0.6035023331642151, "step": 2264 }, { "epoch": 1.22, "learning_rate": 8.139837004228827e-08, "logits/chosen": -2.0240674018859863, "logits/rejected": -2.2690560817718506, "logps/chosen": -0.526585578918457, "logps/rejected": -0.5136145949363708, "loss": 0.6701, "rewards/accuracies": 1.0, "rewards/chosen": 0.8713335394859314, "rewards/margins": 0.04665732383728027, "rewards/rejected": 0.8246762156486511, "step": 2265 }, { "epoch": 1.22, "learning_rate": 8.1381371969916e-08, "logits/chosen": -2.0530993938446045, "logits/rejected": -2.045705795288086, "logps/chosen": -3.175185441970825, "logps/rejected": -4.0218505859375, "loss": 0.532, "rewards/accuracies": 1.0, "rewards/chosen": 0.978147029876709, "rewards/margins": 0.3534398078918457, "rewards/rejected": 0.6247072219848633, "step": 2266 }, { "epoch": 1.22, "learning_rate": 8.13643679113393e-08, "logits/chosen": -2.050625801086426, "logits/rejected": -2.247781991958618, "logps/chosen": -1.1048061847686768, "logps/rejected": -1.0613822937011719, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 0.9603444933891296, "rewards/margins": -0.018031418323516846, "rewards/rejected": 0.9783759117126465, "step": 2267 }, { "epoch": 1.22, "learning_rate": 8.134735786980181e-08, "logits/chosen": -2.0952470302581787, "logits/rejected": -2.0970535278320312, "logps/chosen": -0.7125217318534851, "logps/rejected": -3.0108392238616943, "loss": 0.5671, "rewards/accuracies": 1.0, "rewards/chosen": 0.7554222941398621, "rewards/margins": 0.27034586668014526, "rewards/rejected": 0.4850764274597168, "step": 2268 }, { "epoch": 1.22, "learning_rate": 8.133034184854828e-08, "logits/chosen": -2.1933889389038086, "logits/rejected": -2.1955978870391846, "logps/chosen": -3.833641767501831, "logps/rejected": -0.7887673377990723, "loss": 0.6209, "rewards/accuracies": 1.0, "rewards/chosen": 1.0986860990524292, "rewards/margins": 0.1501244306564331, "rewards/rejected": 0.9485616683959961, "step": 2269 }, { "epoch": 1.22, "learning_rate": 8.131331985082465e-08, "logits/chosen": -2.1318864822387695, "logits/rejected": -2.128734588623047, "logps/chosen": -3.2963247299194336, "logps/rejected": -5.284682273864746, "loss": 0.3428, "rewards/accuracies": 1.0, "rewards/chosen": 1.2817333936691284, "rewards/margins": 0.8941476345062256, "rewards/rejected": 0.38758572936058044, "step": 2270 }, { "epoch": 1.22, "learning_rate": 8.129629187987799e-08, "logits/chosen": -2.1214230060577393, "logits/rejected": -2.1100051403045654, "logps/chosen": -5.054150104522705, "logps/rejected": -7.730894565582275, "loss": 0.5272, "rewards/accuracies": 1.0, "rewards/chosen": 1.0341681241989136, "rewards/margins": 0.36504143476486206, "rewards/rejected": 0.6691266894340515, "step": 2271 }, { "epoch": 1.23, "learning_rate": 8.127925793895647e-08, "logits/chosen": -2.1172034740448, "logits/rejected": -2.307863473892212, "logps/chosen": -0.6152341365814209, "logps/rejected": -0.6240202188491821, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.961885929107666, "rewards/margins": 0.00897294282913208, "rewards/rejected": 0.9529129862785339, "step": 2272 }, { "epoch": 1.23, "learning_rate": 8.126221803130943e-08, "logits/chosen": -2.062446355819702, "logits/rejected": -2.0683839321136475, "logps/chosen": -2.2846126556396484, "logps/rejected": -2.8336668014526367, "loss": 0.424, "rewards/accuracies": 1.0, "rewards/chosen": 1.368851900100708, "rewards/margins": 0.6385031342506409, "rewards/rejected": 0.7303487658500671, "step": 2273 }, { "epoch": 1.23, "learning_rate": 8.124517216018736e-08, "logits/chosen": -2.12835693359375, "logits/rejected": -2.125220537185669, "logps/chosen": -3.0941214561462402, "logps/rejected": -2.123572587966919, "loss": 0.4955, "rewards/accuracies": 1.0, "rewards/chosen": 1.1800041198730469, "rewards/margins": 0.4441159963607788, "rewards/rejected": 0.7358881235122681, "step": 2274 }, { "epoch": 1.23, "learning_rate": 8.122812032884187e-08, "logits/chosen": -2.106901168823242, "logits/rejected": -2.2333638668060303, "logps/chosen": -2.3795783519744873, "logps/rejected": -2.372056722640991, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.9374804496765137, "rewards/margins": 0.00961989164352417, "rewards/rejected": 0.9278605580329895, "step": 2275 }, { "epoch": 1.23, "learning_rate": 8.121106254052571e-08, "logits/chosen": -2.0689878463745117, "logits/rejected": -2.2649240493774414, "logps/chosen": -0.5119078159332275, "logps/rejected": -0.628699004650116, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.8514381647109985, "rewards/margins": -0.004146575927734375, "rewards/rejected": 0.8555847406387329, "step": 2276 }, { "epoch": 1.23, "learning_rate": 8.119399879849276e-08, "logits/chosen": -2.1173741817474365, "logits/rejected": -2.243837594985962, "logps/chosen": -0.9609412550926208, "logps/rejected": -0.9401165246963501, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.996989905834198, "rewards/margins": -0.004785478115081787, "rewards/rejected": 1.0017753839492798, "step": 2277 }, { "epoch": 1.23, "learning_rate": 8.117692910599805e-08, "logits/chosen": -2.084683895111084, "logits/rejected": -2.0483241081237793, "logps/chosen": -5.823962211608887, "logps/rejected": -1.9223392009735107, "loss": 0.3701, "rewards/accuracies": 1.0, "rewards/chosen": 1.5430740118026733, "rewards/margins": 0.8033543229103088, "rewards/rejected": 0.7397196888923645, "step": 2278 }, { "epoch": 1.23, "learning_rate": 8.115985346629773e-08, "logits/chosen": -1.99612295627594, "logits/rejected": -1.9911601543426514, "logps/chosen": -3.144211530685425, "logps/rejected": -4.414229393005371, "loss": 0.3271, "rewards/accuracies": 1.0, "rewards/chosen": 1.4255095720291138, "rewards/margins": 0.9495313763618469, "rewards/rejected": 0.47597819566726685, "step": 2279 }, { "epoch": 1.23, "learning_rate": 8.114277188264909e-08, "logits/chosen": -2.167772054672241, "logits/rejected": -2.354335069656372, "logps/chosen": -1.636960506439209, "logps/rejected": -1.6448473930358887, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9547659158706665, "rewards/margins": 0.013251125812530518, "rewards/rejected": 0.941514790058136, "step": 2280 }, { "epoch": 1.23, "learning_rate": 8.112568435831055e-08, "logits/chosen": -1.9531865119934082, "logits/rejected": -2.2868528366088867, "logps/chosen": -1.8886702060699463, "logps/rejected": -2.0829267501831055, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.7246265411376953, "rewards/margins": 0.01399928331375122, "rewards/rejected": 0.7106272578239441, "step": 2281 }, { "epoch": 1.23, "learning_rate": 8.110859089654169e-08, "logits/chosen": -2.2230312824249268, "logits/rejected": -2.0266926288604736, "logps/chosen": -41.35289764404297, "logps/rejected": -3.7126107215881348, "loss": 0.1487, "rewards/accuracies": 1.0, "rewards/chosen": 2.383335828781128, "rewards/margins": 1.8302205801010132, "rewards/rejected": 0.5531152486801147, "step": 2282 }, { "epoch": 1.23, "learning_rate": 8.10914915006032e-08, "logits/chosen": -2.091939926147461, "logits/rejected": -1.9757941961288452, "logps/chosen": -33.45275115966797, "logps/rejected": -2.985832452774048, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 1.9697704315185547, "rewards/margins": 1.432525634765625, "rewards/rejected": 0.5372447371482849, "step": 2283 }, { "epoch": 1.23, "learning_rate": 8.107438617375686e-08, "logits/chosen": -2.072183132171631, "logits/rejected": -2.230665445327759, "logps/chosen": -0.9453917741775513, "logps/rejected": -0.8891347646713257, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.8203882575035095, "rewards/margins": 0.005909264087677002, "rewards/rejected": 0.8144789934158325, "step": 2284 }, { "epoch": 1.23, "learning_rate": 8.105727491926566e-08, "logits/chosen": -2.0243759155273438, "logits/rejected": -2.032935619354248, "logps/chosen": -1.8086990118026733, "logps/rejected": -2.291553258895874, "loss": 0.5197, "rewards/accuracies": 1.0, "rewards/chosen": 1.017872929573059, "rewards/margins": 0.383365273475647, "rewards/rejected": 0.6345076560974121, "step": 2285 }, { "epoch": 1.23, "learning_rate": 8.104015774039369e-08, "logits/chosen": -2.135286569595337, "logits/rejected": -2.1227526664733887, "logps/chosen": -1.7764230966567993, "logps/rejected": -7.022838592529297, "loss": 0.3984, "rewards/accuracies": 1.0, "rewards/chosen": 1.1539983749389648, "rewards/margins": 0.7143900394439697, "rewards/rejected": 0.43960830569267273, "step": 2286 }, { "epoch": 1.23, "learning_rate": 8.102303464040616e-08, "logits/chosen": -2.0577802658081055, "logits/rejected": -2.2441532611846924, "logps/chosen": -0.5988785624504089, "logps/rejected": -0.565567135810852, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.8028551936149597, "rewards/margins": 0.011956393718719482, "rewards/rejected": 0.7908987998962402, "step": 2287 }, { "epoch": 1.23, "learning_rate": 8.100590562256939e-08, "logits/chosen": -1.9930424690246582, "logits/rejected": -1.9901118278503418, "logps/chosen": -1.2988165616989136, "logps/rejected": -4.883100509643555, "loss": 0.4323, "rewards/accuracies": 1.0, "rewards/chosen": 1.1200779676437378, "rewards/margins": 0.6147993803024292, "rewards/rejected": 0.5052785873413086, "step": 2288 }, { "epoch": 1.23, "learning_rate": 8.098877069015089e-08, "logits/chosen": -2.0819313526153564, "logits/rejected": -2.0212008953094482, "logps/chosen": -20.968852996826172, "logps/rejected": -3.068624496459961, "loss": 0.3415, "rewards/accuracies": 1.0, "rewards/chosen": 1.6147640943527222, "rewards/margins": 0.8988369703292847, "rewards/rejected": 0.7159271240234375, "step": 2289 }, { "epoch": 1.24, "learning_rate": 8.097162984641925e-08, "logits/chosen": -2.0914764404296875, "logits/rejected": -2.284043073654175, "logps/chosen": -3.6413440704345703, "logps/rejected": -3.764481782913208, "loss": 0.7049, "rewards/accuracies": 0.0, "rewards/chosen": 0.7185468673706055, "rewards/margins": -0.023397743701934814, "rewards/rejected": 0.7419446110725403, "step": 2290 }, { "epoch": 1.24, "learning_rate": 8.095448309464417e-08, "logits/chosen": -2.0797088146209717, "logits/rejected": -2.2769105434417725, "logps/chosen": -0.4748416841030121, "logps/rejected": -3.2399845123291016, "loss": 0.6191, "rewards/accuracies": 1.0, "rewards/chosen": 0.9780155420303345, "rewards/margins": 0.15408039093017578, "rewards/rejected": 0.8239351511001587, "step": 2291 }, { "epoch": 1.24, "learning_rate": 8.093733043809654e-08, "logits/chosen": -2.067537784576416, "logits/rejected": -2.2887048721313477, "logps/chosen": -15.005110740661621, "logps/rejected": -12.158105850219727, "loss": 0.6606, "rewards/accuracies": 1.0, "rewards/chosen": 1.00560462474823, "rewards/margins": 0.06619477272033691, "rewards/rejected": 0.9394098520278931, "step": 2292 }, { "epoch": 1.24, "learning_rate": 8.092017188004836e-08, "logits/chosen": -2.086590051651001, "logits/rejected": -2.08566951751709, "logps/chosen": -1.2317252159118652, "logps/rejected": -3.1747028827667236, "loss": 0.4794, "rewards/accuracies": 1.0, "rewards/chosen": 1.1124862432479858, "rewards/margins": 0.48587673902511597, "rewards/rejected": 0.6266095042228699, "step": 2293 }, { "epoch": 1.24, "learning_rate": 8.090300742377268e-08, "logits/chosen": -2.2408928871154785, "logits/rejected": -2.1969587802886963, "logps/chosen": -37.84626770019531, "logps/rejected": -11.737642288208008, "loss": 0.2247, "rewards/accuracies": 1.0, "rewards/chosen": 2.1251473426818848, "rewards/margins": 1.3783364295959473, "rewards/rejected": 0.7468109130859375, "step": 2294 }, { "epoch": 1.24, "learning_rate": 8.088583707254378e-08, "logits/chosen": -2.034820795059204, "logits/rejected": -2.308750629425049, "logps/chosen": -0.5664911270141602, "logps/rejected": -0.49776700139045715, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 0.9149948358535767, "rewards/margins": 0.03480195999145508, "rewards/rejected": 0.8801928758621216, "step": 2295 }, { "epoch": 1.24, "learning_rate": 8.0868660829637e-08, "logits/chosen": -2.0483829975128174, "logits/rejected": -2.2576491832733154, "logps/chosen": -0.32756340503692627, "logps/rejected": -0.37423980236053467, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 1.0252128839492798, "rewards/margins": 0.022104978561401367, "rewards/rejected": 1.0031079053878784, "step": 2296 }, { "epoch": 1.24, "learning_rate": 8.085147869832882e-08, "logits/chosen": -2.076979160308838, "logits/rejected": -2.3212168216705322, "logps/chosen": -7.6080121994018555, "logps/rejected": -15.323888778686523, "loss": 0.6446, "rewards/accuracies": 1.0, "rewards/chosen": 0.9167680144309998, "rewards/margins": 0.09963732957839966, "rewards/rejected": 0.8171306848526001, "step": 2297 }, { "epoch": 1.24, "learning_rate": 8.083429068189686e-08, "logits/chosen": -2.107614040374756, "logits/rejected": -2.117238998413086, "logps/chosen": -2.0322394371032715, "logps/rejected": -2.4153859615325928, "loss": 0.4958, "rewards/accuracies": 1.0, "rewards/chosen": 1.1488577127456665, "rewards/margins": 0.44345617294311523, "rewards/rejected": 0.7054015398025513, "step": 2298 }, { "epoch": 1.24, "learning_rate": 8.081709678361984e-08, "logits/chosen": -2.0911688804626465, "logits/rejected": -2.3238494396209717, "logps/chosen": -0.5901928544044495, "logps/rejected": -0.6433935165405273, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.8590942621231079, "rewards/margins": 0.006309211254119873, "rewards/rejected": 0.852785050868988, "step": 2299 }, { "epoch": 1.24, "learning_rate": 8.07998970067776e-08, "logits/chosen": -2.1473894119262695, "logits/rejected": -2.2965190410614014, "logps/chosen": -2.360105037689209, "logps/rejected": -2.1338396072387695, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.5772672295570374, "rewards/margins": -0.007660329341888428, "rewards/rejected": 0.5849275588989258, "step": 2300 }, { "epoch": 1.24, "learning_rate": 8.078269135465113e-08, "logits/chosen": -2.0453338623046875, "logits/rejected": -2.047002077102661, "logps/chosen": -5.340023040771484, "logps/rejected": -3.3000242710113525, "loss": 0.1434, "rewards/accuracies": 1.0, "rewards/chosen": 2.358635902404785, "rewards/margins": 1.8695447444915771, "rewards/rejected": 0.4890912175178528, "step": 2301 }, { "epoch": 1.24, "learning_rate": 8.076547983052251e-08, "logits/chosen": -1.9643397331237793, "logits/rejected": -2.25447416305542, "logps/chosen": -0.19193130731582642, "logps/rejected": -0.21660566329956055, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.8273751139640808, "rewards/margins": 0.013705313205718994, "rewards/rejected": 0.8136698007583618, "step": 2302 }, { "epoch": 1.24, "learning_rate": 8.074826243767496e-08, "logits/chosen": -1.9714019298553467, "logits/rejected": -2.306417226791382, "logps/chosen": -0.642023503780365, "logps/rejected": -0.6934328079223633, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 1.0634628534317017, "rewards/margins": 0.014678478240966797, "rewards/rejected": 1.0487843751907349, "step": 2303 }, { "epoch": 1.24, "learning_rate": 8.07310391793928e-08, "logits/chosen": -1.9764187335968018, "logits/rejected": -1.9327961206436157, "logps/chosen": -11.946416854858398, "logps/rejected": -1.7282634973526, "loss": 0.5285, "rewards/accuracies": 1.0, "rewards/chosen": 1.3021763563156128, "rewards/margins": 0.36191433668136597, "rewards/rejected": 0.9402620196342468, "step": 2304 }, { "epoch": 1.24, "learning_rate": 8.071381005896149e-08, "logits/chosen": -2.093266725540161, "logits/rejected": -2.0958845615386963, "logps/chosen": -0.7386761903762817, "logps/rejected": -2.3268086910247803, "loss": 0.5471, "rewards/accuracies": 1.0, "rewards/chosen": 0.9349994659423828, "rewards/margins": 0.31719958782196045, "rewards/rejected": 0.6177998781204224, "step": 2305 }, { "epoch": 1.24, "learning_rate": 8.069657507966758e-08, "logits/chosen": -2.029046058654785, "logits/rejected": -2.0403363704681396, "logps/chosen": -3.2466013431549072, "logps/rejected": -2.3048503398895264, "loss": 0.4809, "rewards/accuracies": 1.0, "rewards/chosen": 1.0794813632965088, "rewards/margins": 0.48213446140289307, "rewards/rejected": 0.5973469018936157, "step": 2306 }, { "epoch": 1.24, "learning_rate": 8.06793342447988e-08, "logits/chosen": -1.98008394241333, "logits/rejected": -1.988709807395935, "logps/chosen": -1.7417477369308472, "logps/rejected": -5.165453910827637, "loss": 0.3627, "rewards/accuracies": 1.0, "rewards/chosen": 1.467544436454773, "rewards/margins": 0.8272863030433655, "rewards/rejected": 0.6402581334114075, "step": 2307 }, { "epoch": 1.24, "learning_rate": 8.06620875576439e-08, "logits/chosen": -2.119845390319824, "logits/rejected": -2.110952854156494, "logps/chosen": -5.141053676605225, "logps/rejected": -2.2364537715911865, "loss": 0.3937, "rewards/accuracies": 1.0, "rewards/chosen": 1.5813095569610596, "rewards/margins": 0.7287655472755432, "rewards/rejected": 0.8525440096855164, "step": 2308 }, { "epoch": 1.25, "learning_rate": 8.064483502149284e-08, "logits/chosen": -2.075157880783081, "logits/rejected": -2.3192801475524902, "logps/chosen": -4.599298477172852, "logps/rejected": -4.690874099731445, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 1.2949438095092773, "rewards/margins": 0.020778775215148926, "rewards/rejected": 1.2741650342941284, "step": 2309 }, { "epoch": 1.25, "learning_rate": 8.062757663963664e-08, "logits/chosen": -2.047462224960327, "logits/rejected": -2.0514650344848633, "logps/chosen": -0.6309208869934082, "logps/rejected": -4.44703483581543, "loss": 0.5324, "rewards/accuracies": 1.0, "rewards/chosen": 0.8236408233642578, "rewards/margins": 0.3522930145263672, "rewards/rejected": 0.4713478088378906, "step": 2310 }, { "epoch": 1.25, "learning_rate": 8.061031241536745e-08, "logits/chosen": -2.11885142326355, "logits/rejected": -2.116234302520752, "logps/chosen": -6.088198184967041, "logps/rejected": -3.250875949859619, "loss": 0.3914, "rewards/accuracies": 1.0, "rewards/chosen": 1.3951116800308228, "rewards/margins": 0.7359400391578674, "rewards/rejected": 0.6591716408729553, "step": 2311 }, { "epoch": 1.25, "learning_rate": 8.059304235197853e-08, "logits/chosen": -2.1109821796417236, "logits/rejected": -2.283000946044922, "logps/chosen": -1.1085929870605469, "logps/rejected": -1.1098754405975342, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9522578120231628, "rewards/margins": 0.02165275812149048, "rewards/rejected": 0.9306050539016724, "step": 2312 }, { "epoch": 1.25, "learning_rate": 8.057576645276427e-08, "logits/chosen": -1.9480801820755005, "logits/rejected": -2.254835605621338, "logps/chosen": -1.3980557918548584, "logps/rejected": -1.3227781057357788, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.9615039825439453, "rewards/margins": -0.005574345588684082, "rewards/rejected": 0.9670783281326294, "step": 2313 }, { "epoch": 1.25, "learning_rate": 8.055848472102014e-08, "logits/chosen": -2.0380306243896484, "logits/rejected": -2.042705774307251, "logps/chosen": -7.902067184448242, "logps/rejected": -7.658363342285156, "loss": 0.3027, "rewards/accuracies": 1.0, "rewards/chosen": 1.7111437320709229, "rewards/margins": 1.0397741794586182, "rewards/rejected": 0.6713694930076599, "step": 2314 }, { "epoch": 1.25, "learning_rate": 8.054119716004278e-08, "logits/chosen": -2.1196117401123047, "logits/rejected": -2.0212574005126953, "logps/chosen": -26.629322052001953, "logps/rejected": -3.5380337238311768, "loss": 0.2286, "rewards/accuracies": 1.0, "rewards/chosen": 1.9275718927383423, "rewards/margins": 1.3593716621398926, "rewards/rejected": 0.5682002305984497, "step": 2315 }, { "epoch": 1.25, "learning_rate": 8.05239037731299e-08, "logits/chosen": -2.118497610092163, "logits/rejected": -2.1161909103393555, "logps/chosen": -5.266515254974365, "logps/rejected": -2.4588356018066406, "loss": 0.3242, "rewards/accuracies": 1.0, "rewards/chosen": 1.6113251447677612, "rewards/margins": 0.9600918292999268, "rewards/rejected": 0.6512333154678345, "step": 2316 }, { "epoch": 1.25, "learning_rate": 8.05066045635803e-08, "logits/chosen": -2.184837579727173, "logits/rejected": -2.009838104248047, "logps/chosen": -26.089157104492188, "logps/rejected": -3.9897968769073486, "loss": 0.2329, "rewards/accuracies": 1.0, "rewards/chosen": 1.9461033344268799, "rewards/margins": 1.338672399520874, "rewards/rejected": 0.6074309349060059, "step": 2317 }, { "epoch": 1.25, "learning_rate": 8.048929953469393e-08, "logits/chosen": -2.1002728939056396, "logits/rejected": -2.1394057273864746, "logps/chosen": -3.637442111968994, "logps/rejected": -9.708242416381836, "loss": 0.4784, "rewards/accuracies": 1.0, "rewards/chosen": 1.3204790353775024, "rewards/margins": 0.48862648010253906, "rewards/rejected": 0.8318525552749634, "step": 2318 }, { "epoch": 1.25, "learning_rate": 8.047198868977185e-08, "logits/chosen": -2.0557029247283936, "logits/rejected": -2.277534008026123, "logps/chosen": -1.2146319150924683, "logps/rejected": -1.159278154373169, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.8133726119995117, "rewards/margins": 0.020479679107666016, "rewards/rejected": 0.7928929328918457, "step": 2319 }, { "epoch": 1.25, "learning_rate": 8.045467203211622e-08, "logits/chosen": -2.045055389404297, "logits/rejected": -2.2947819232940674, "logps/chosen": -0.6722633838653564, "logps/rejected": -0.6757918000221252, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.9115277528762817, "rewards/margins": 0.02626192569732666, "rewards/rejected": 0.8852658271789551, "step": 2320 }, { "epoch": 1.25, "learning_rate": 8.043734956503028e-08, "logits/chosen": -2.0403966903686523, "logits/rejected": -2.0337181091308594, "logps/chosen": -5.678696632385254, "logps/rejected": -6.100210189819336, "loss": 0.3773, "rewards/accuracies": 1.0, "rewards/chosen": 1.408974051475525, "rewards/margins": 0.7802625298500061, "rewards/rejected": 0.6287115216255188, "step": 2321 }, { "epoch": 1.25, "learning_rate": 8.042002129181844e-08, "logits/chosen": -2.096829891204834, "logits/rejected": -2.092752456665039, "logps/chosen": -7.731015205383301, "logps/rejected": -3.559723138809204, "loss": 0.3634, "rewards/accuracies": 1.0, "rewards/chosen": 1.3238552808761597, "rewards/margins": 0.8251296281814575, "rewards/rejected": 0.49872562289237976, "step": 2322 }, { "epoch": 1.25, "learning_rate": 8.040268721578618e-08, "logits/chosen": -2.0576300621032715, "logits/rejected": -2.0480709075927734, "logps/chosen": -4.471599578857422, "logps/rejected": -2.159158945083618, "loss": 0.3968, "rewards/accuracies": 1.0, "rewards/chosen": 1.5936301946640015, "rewards/margins": 0.7192222476005554, "rewards/rejected": 0.874407947063446, "step": 2323 }, { "epoch": 1.25, "learning_rate": 8.038534734024006e-08, "logits/chosen": -2.0613765716552734, "logits/rejected": -2.2300891876220703, "logps/chosen": -7.866953372955322, "logps/rejected": -1.0552345514297485, "loss": 0.7076, "rewards/accuracies": 0.0, "rewards/chosen": 0.7884034514427185, "rewards/margins": -0.02878880500793457, "rewards/rejected": 0.8171922564506531, "step": 2324 }, { "epoch": 1.25, "learning_rate": 8.03680016684878e-08, "logits/chosen": -2.098390817642212, "logits/rejected": -2.1264607906341553, "logps/chosen": -3.9518911838531494, "logps/rejected": -11.780511856079102, "loss": 0.3122, "rewards/accuracies": 1.0, "rewards/chosen": 1.4806822538375854, "rewards/margins": 1.0041004419326782, "rewards/rejected": 0.47658178210258484, "step": 2325 }, { "epoch": 1.25, "learning_rate": 8.035065020383819e-08, "logits/chosen": -2.1816399097442627, "logits/rejected": -2.188807487487793, "logps/chosen": -1.7727664709091187, "logps/rejected": -1.8312841653823853, "loss": 0.4921, "rewards/accuracies": 1.0, "rewards/chosen": 1.0293785333633423, "rewards/margins": 0.4530491232872009, "rewards/rejected": 0.5763294100761414, "step": 2326 }, { "epoch": 1.26, "learning_rate": 8.033329294960114e-08, "logits/chosen": -2.0426180362701416, "logits/rejected": -2.242711067199707, "logps/chosen": -0.5583139061927795, "logps/rejected": -0.5868254899978638, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9756469130516052, "rewards/margins": 0.018288135528564453, "rewards/rejected": 0.9573587775230408, "step": 2327 }, { "epoch": 1.26, "learning_rate": 8.031592990908768e-08, "logits/chosen": -2.0638983249664307, "logits/rejected": -2.282698631286621, "logps/chosen": -5.175142765045166, "logps/rejected": -0.9713166356086731, "loss": 0.9199, "rewards/accuracies": 0.0, "rewards/chosen": 0.671118438243866, "rewards/margins": -0.4114602208137512, "rewards/rejected": 1.0825786590576172, "step": 2328 }, { "epoch": 1.26, "learning_rate": 8.029856108560989e-08, "logits/chosen": -2.1552329063415527, "logits/rejected": -2.156144380569458, "logps/chosen": -1.3261315822601318, "logps/rejected": -2.280344247817993, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.9406588673591614, "rewards/margins": 0.03363943099975586, "rewards/rejected": 0.9070194363594055, "step": 2329 }, { "epoch": 1.26, "learning_rate": 8.028118648248101e-08, "logits/chosen": -2.0673718452453613, "logits/rejected": -2.2898387908935547, "logps/chosen": -2.520031452178955, "logps/rejected": -2.510869026184082, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.9555122256278992, "rewards/margins": 0.003412306308746338, "rewards/rejected": 0.9520999193191528, "step": 2330 }, { "epoch": 1.26, "learning_rate": 8.026380610301535e-08, "logits/chosen": -2.0297601222991943, "logits/rejected": -2.024332284927368, "logps/chosen": -6.408890724182129, "logps/rejected": -2.223443031311035, "loss": 0.528, "rewards/accuracies": 1.0, "rewards/chosen": 1.0770680904388428, "rewards/margins": 0.3631727695465088, "rewards/rejected": 0.713895320892334, "step": 2331 }, { "epoch": 1.26, "learning_rate": 8.024641995052836e-08, "logits/chosen": -1.9537547826766968, "logits/rejected": -1.960512399673462, "logps/chosen": -3.0153849124908447, "logps/rejected": -4.749333381652832, "loss": 0.4364, "rewards/accuracies": 1.0, "rewards/chosen": 1.1913013458251953, "rewards/margins": 0.6030457615852356, "rewards/rejected": 0.5882555842399597, "step": 2332 }, { "epoch": 1.26, "learning_rate": 8.022902802833653e-08, "logits/chosen": -2.1014511585235596, "logits/rejected": -2.1988556385040283, "logps/chosen": -1.3631001710891724, "logps/rejected": -1.260322093963623, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 1.0670452117919922, "rewards/margins": 0.005551934242248535, "rewards/rejected": 1.0614932775497437, "step": 2333 }, { "epoch": 1.26, "learning_rate": 8.021163033975749e-08, "logits/chosen": -2.0567855834960938, "logits/rejected": -2.303084373474121, "logps/chosen": -0.6322458386421204, "logps/rejected": -0.6135628819465637, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.802157998085022, "rewards/margins": -0.006451308727264404, "rewards/rejected": 0.8086093068122864, "step": 2334 }, { "epoch": 1.26, "learning_rate": 8.019422688810998e-08, "logits/chosen": -2.0073511600494385, "logits/rejected": -2.009507894515991, "logps/chosen": -2.1261026859283447, "logps/rejected": -7.034220218658447, "loss": 0.4064, "rewards/accuracies": 1.0, "rewards/chosen": 0.9580000042915344, "rewards/margins": 0.6904417276382446, "rewards/rejected": 0.2675582468509674, "step": 2335 }, { "epoch": 1.26, "learning_rate": 8.01768176767138e-08, "logits/chosen": -2.084526300430298, "logits/rejected": -2.252474784851074, "logps/chosen": -1.1416898965835571, "logps/rejected": -1.1999355554580688, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.7457260489463806, "rewards/margins": 0.020440280437469482, "rewards/rejected": 0.7252857685089111, "step": 2336 }, { "epoch": 1.26, "learning_rate": 8.01594027088899e-08, "logits/chosen": -2.0192837715148926, "logits/rejected": -1.9984586238861084, "logps/chosen": -4.897010803222656, "logps/rejected": -5.345055103302002, "loss": 0.2886, "rewards/accuracies": 1.0, "rewards/chosen": 1.4992417097091675, "rewards/margins": 1.094926118850708, "rewards/rejected": 0.40431562066078186, "step": 2337 }, { "epoch": 1.26, "learning_rate": 8.014198198796029e-08, "logits/chosen": -2.1857268810272217, "logits/rejected": -2.1351916790008545, "logps/chosen": -16.955171585083008, "logps/rejected": -4.081511497497559, "loss": 0.2897, "rewards/accuracies": 1.0, "rewards/chosen": 1.6323095560073853, "rewards/margins": 1.090654969215393, "rewards/rejected": 0.5416545867919922, "step": 2338 }, { "epoch": 1.26, "learning_rate": 8.012455551724809e-08, "logits/chosen": -2.108952760696411, "logits/rejected": -2.0417513847351074, "logps/chosen": -12.245196342468262, "logps/rejected": -7.608022689819336, "loss": 0.7894, "rewards/accuracies": 0.0, "rewards/chosen": 0.48574763536453247, "rewards/margins": -0.18404054641723633, "rewards/rejected": 0.6697881817817688, "step": 2339 }, { "epoch": 1.26, "learning_rate": 8.010712330007752e-08, "logits/chosen": -1.9847664833068848, "logits/rejected": -2.253805160522461, "logps/chosen": -0.3247305154800415, "logps/rejected": -0.3502587378025055, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.8735870718955994, "rewards/margins": 0.006754815578460693, "rewards/rejected": 0.8668322563171387, "step": 2340 }, { "epoch": 1.26, "learning_rate": 8.008968533977385e-08, "logits/chosen": -2.0516233444213867, "logits/rejected": -2.041689157485962, "logps/chosen": -4.1484270095825195, "logps/rejected": -6.469944000244141, "loss": 0.3553, "rewards/accuracies": 1.0, "rewards/chosen": 1.2169005870819092, "rewards/margins": 0.8517851233482361, "rewards/rejected": 0.3651154637336731, "step": 2341 }, { "epoch": 1.26, "learning_rate": 8.007224163966352e-08, "logits/chosen": -1.9351909160614014, "logits/rejected": -2.22652268409729, "logps/chosen": -0.525754988193512, "logps/rejected": -0.5010465383529663, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8155104517936707, "rewards/margins": 0.01977825164794922, "rewards/rejected": 0.7957322001457214, "step": 2342 }, { "epoch": 1.26, "learning_rate": 8.005479220307406e-08, "logits/chosen": -2.1851084232330322, "logits/rejected": -2.1802749633789062, "logps/chosen": -6.279256820678711, "logps/rejected": -6.493786811828613, "loss": 0.3815, "rewards/accuracies": 1.0, "rewards/chosen": 1.2061207294464111, "rewards/margins": 0.7668280601501465, "rewards/rejected": 0.43929263949394226, "step": 2343 }, { "epoch": 1.26, "learning_rate": 8.003733703333401e-08, "logits/chosen": -2.057481288909912, "logits/rejected": -2.1100547313690186, "logps/chosen": -5.09095573425293, "logps/rejected": -14.978143692016602, "loss": 0.2535, "rewards/accuracies": 1.0, "rewards/chosen": 1.5118064880371094, "rewards/margins": 1.2428086996078491, "rewards/rejected": 0.26899775862693787, "step": 2344 }, { "epoch": 1.26, "learning_rate": 8.001987613377309e-08, "logits/chosen": -2.231980323791504, "logits/rejected": -2.2917261123657227, "logps/chosen": -8.299152374267578, "logps/rejected": -26.75791358947754, "loss": 0.4314, "rewards/accuracies": 1.0, "rewards/chosen": 1.1652435064315796, "rewards/margins": 0.6172864437103271, "rewards/rejected": 0.5479570627212524, "step": 2345 }, { "epoch": 1.27, "learning_rate": 8.000240950772206e-08, "logits/chosen": -1.9663997888565063, "logits/rejected": -2.2462339401245117, "logps/chosen": -2.3321378231048584, "logps/rejected": -2.42494535446167, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 1.0337591171264648, "rewards/margins": 0.006907224655151367, "rewards/rejected": 1.0268518924713135, "step": 2346 }, { "epoch": 1.27, "learning_rate": 7.998493715851283e-08, "logits/chosen": -2.058797836303711, "logits/rejected": -2.268237829208374, "logps/chosen": -2.17747163772583, "logps/rejected": -2.076629400253296, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.8867353796958923, "rewards/margins": 0.004607141017913818, "rewards/rejected": 0.8821282386779785, "step": 2347 }, { "epoch": 1.27, "learning_rate": 7.996745908947834e-08, "logits/chosen": -2.1426074504852295, "logits/rejected": -2.143378496170044, "logps/chosen": -1.3801050186157227, "logps/rejected": -2.981877088546753, "loss": 0.5263, "rewards/accuracies": 1.0, "rewards/chosen": 1.023175597190857, "rewards/margins": 0.36718058586120605, "rewards/rejected": 0.6559950113296509, "step": 2348 }, { "epoch": 1.27, "learning_rate": 7.994997530395264e-08, "logits/chosen": -2.1054022312164307, "logits/rejected": -2.3167905807495117, "logps/chosen": -0.5341396331787109, "logps/rejected": -0.5700719356536865, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9663355946540833, "rewards/margins": 0.019251704216003418, "rewards/rejected": 0.9470838904380798, "step": 2349 }, { "epoch": 1.27, "learning_rate": 7.993248580527092e-08, "logits/chosen": -2.083930253982544, "logits/rejected": -2.314018964767456, "logps/chosen": -0.31790438294410706, "logps/rejected": -0.2899545133113861, "loss": 0.6702, "rewards/accuracies": 1.0, "rewards/chosen": 0.8925881385803223, "rewards/margins": 0.04638671875, "rewards/rejected": 0.8462014198303223, "step": 2350 }, { "epoch": 1.27, "learning_rate": 7.991499059676937e-08, "logits/chosen": -1.9922367334365845, "logits/rejected": -2.2719931602478027, "logps/chosen": -0.6656720042228699, "logps/rejected": -0.6547649502754211, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.9272192120552063, "rewards/margins": 0.019797027111053467, "rewards/rejected": 0.9074221849441528, "step": 2351 }, { "epoch": 1.27, "learning_rate": 7.989748968178535e-08, "logits/chosen": -1.979104995727539, "logits/rejected": -2.2467551231384277, "logps/chosen": -1.8621106147766113, "logps/rejected": -1.9354462623596191, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.9548671841621399, "rewards/margins": -0.006484389305114746, "rewards/rejected": 0.9613515734672546, "step": 2352 }, { "epoch": 1.27, "learning_rate": 7.987998306365724e-08, "logits/chosen": -2.035370111465454, "logits/rejected": -2.0243875980377197, "logps/chosen": -4.095523357391357, "logps/rejected": -1.8601912260055542, "loss": 0.393, "rewards/accuracies": 1.0, "rewards/chosen": 1.6012226343154907, "rewards/margins": 0.7309750914573669, "rewards/rejected": 0.8702475428581238, "step": 2353 }, { "epoch": 1.27, "learning_rate": 7.986247074572457e-08, "logits/chosen": -1.9792016744613647, "logits/rejected": -1.9762136936187744, "logps/chosen": -3.483107566833496, "logps/rejected": -1.5079916715621948, "loss": 0.6397, "rewards/accuracies": 1.0, "rewards/chosen": 1.1795986890792847, "rewards/margins": 0.1099097728729248, "rewards/rejected": 1.0696889162063599, "step": 2354 }, { "epoch": 1.27, "learning_rate": 7.984495273132794e-08, "logits/chosen": -2.2010982036590576, "logits/rejected": -2.0296833515167236, "logps/chosen": -38.311710357666016, "logps/rejected": -3.3319132328033447, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 2.539083957672119, "rewards/margins": 2.031808853149414, "rewards/rejected": 0.5072751045227051, "step": 2355 }, { "epoch": 1.27, "learning_rate": 7.9827429023809e-08, "logits/chosen": -1.8968247175216675, "logits/rejected": -1.9039311408996582, "logps/chosen": -2.3376786708831787, "logps/rejected": -5.12386417388916, "loss": 0.3907, "rewards/accuracies": 1.0, "rewards/chosen": 1.2396366596221924, "rewards/margins": 0.7380961775779724, "rewards/rejected": 0.50154048204422, "step": 2356 }, { "epoch": 1.27, "learning_rate": 7.980989962651053e-08, "logits/chosen": -2.0746564865112305, "logits/rejected": -2.089055299758911, "logps/chosen": -8.398616790771484, "logps/rejected": -5.328930377960205, "loss": 0.426, "rewards/accuracies": 1.0, "rewards/chosen": 1.5196688175201416, "rewards/margins": 0.63277667760849, "rewards/rejected": 0.8868921399116516, "step": 2357 }, { "epoch": 1.27, "learning_rate": 7.979236454277636e-08, "logits/chosen": -2.067215919494629, "logits/rejected": -2.272507905960083, "logps/chosen": -3.9599506855010986, "logps/rejected": -2.605607509613037, "loss": 0.7784, "rewards/accuracies": 0.0, "rewards/chosen": 0.9353457689285278, "rewards/margins": -0.1637434959411621, "rewards/rejected": 1.09908926486969, "step": 2358 }, { "epoch": 1.27, "learning_rate": 7.977482377595146e-08, "logits/chosen": -2.0133345127105713, "logits/rejected": -2.0237815380096436, "logps/chosen": -1.5624994039535522, "logps/rejected": -3.7099196910858154, "loss": 0.3187, "rewards/accuracies": 1.0, "rewards/chosen": 1.5124375820159912, "rewards/margins": 0.979910671710968, "rewards/rejected": 0.5325269103050232, "step": 2359 }, { "epoch": 1.27, "learning_rate": 7.975727732938181e-08, "logits/chosen": -2.1284308433532715, "logits/rejected": -2.1362593173980713, "logps/chosen": -2.127086639404297, "logps/rejected": -3.855020046234131, "loss": 0.4177, "rewards/accuracies": 1.0, "rewards/chosen": 1.3025974035263062, "rewards/margins": 0.6569461226463318, "rewards/rejected": 0.6456512808799744, "step": 2360 }, { "epoch": 1.27, "learning_rate": 7.973972520641454e-08, "logits/chosen": -2.032341241836548, "logits/rejected": -2.2252817153930664, "logps/chosen": -0.853044867515564, "logps/rejected": -0.7807469367980957, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.9388473629951477, "rewards/margins": 0.023406386375427246, "rewards/rejected": 0.9154409766197205, "step": 2361 }, { "epoch": 1.27, "learning_rate": 7.97221674103978e-08, "logits/chosen": -2.0928938388824463, "logits/rejected": -2.3501474857330322, "logps/chosen": -1.4556618928909302, "logps/rejected": -1.487011432647705, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 1.0657867193222046, "rewards/margins": 0.004075765609741211, "rewards/rejected": 1.0617109537124634, "step": 2362 }, { "epoch": 1.27, "learning_rate": 7.970460394468088e-08, "logits/chosen": -2.1401772499084473, "logits/rejected": -2.242213726043701, "logps/chosen": -2.3873586654663086, "logps/rejected": -0.8653796315193176, "loss": 0.6555, "rewards/accuracies": 1.0, "rewards/chosen": 0.9442138075828552, "rewards/margins": 0.07669252157211304, "rewards/rejected": 0.8675212860107422, "step": 2363 }, { "epoch": 1.28, "learning_rate": 7.968703481261413e-08, "logits/chosen": -2.0744731426239014, "logits/rejected": -2.2572574615478516, "logps/chosen": -0.5337416529655457, "logps/rejected": -0.5146282911300659, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.73787921667099, "rewards/margins": 0.018101215362548828, "rewards/rejected": 0.7197780013084412, "step": 2364 }, { "epoch": 1.28, "learning_rate": 7.966946001754897e-08, "logits/chosen": -2.1232409477233887, "logits/rejected": -2.24904465675354, "logps/chosen": -8.004688262939453, "logps/rejected": -8.106623649597168, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 1.0975401401519775, "rewards/margins": 0.01975226402282715, "rewards/rejected": 1.0777878761291504, "step": 2365 }, { "epoch": 1.28, "learning_rate": 7.965187956283791e-08, "logits/chosen": -2.239933729171753, "logits/rejected": -2.3135857582092285, "logps/chosen": -2.347748041152954, "logps/rejected": -2.505002737045288, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.7643098831176758, "rewards/margins": 0.012652397155761719, "rewards/rejected": 0.7516574859619141, "step": 2366 }, { "epoch": 1.28, "learning_rate": 7.963429345183453e-08, "logits/chosen": -2.0881187915802, "logits/rejected": -2.0972862243652344, "logps/chosen": -2.0576813220977783, "logps/rejected": -2.365410566329956, "loss": 0.4882, "rewards/accuracies": 1.0, "rewards/chosen": 1.1874266862869263, "rewards/margins": 0.46312248706817627, "rewards/rejected": 0.72430419921875, "step": 2367 }, { "epoch": 1.28, "learning_rate": 7.961670168789351e-08, "logits/chosen": -2.097137928009033, "logits/rejected": -2.253634452819824, "logps/chosen": -0.4948478043079376, "logps/rejected": -0.5181864500045776, "loss": 0.6963, "rewards/accuracies": 0.0, "rewards/chosen": 0.8322159647941589, "rewards/margins": -0.006199181079864502, "rewards/rejected": 0.8384151458740234, "step": 2368 }, { "epoch": 1.28, "learning_rate": 7.959910427437059e-08, "logits/chosen": -1.9793249368667603, "logits/rejected": -1.9841688871383667, "logps/chosen": -1.1760245561599731, "logps/rejected": -4.3645548820495605, "loss": 0.4876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0450325012207031, "rewards/margins": 0.4646797776222229, "rewards/rejected": 0.5803527235984802, "step": 2369 }, { "epoch": 1.28, "learning_rate": 7.958150121462259e-08, "logits/chosen": -2.1734588146209717, "logits/rejected": -2.2410194873809814, "logps/chosen": -0.5036134719848633, "logps/rejected": -0.5265307426452637, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 1.0365650653839111, "rewards/margins": 0.009687066078186035, "rewards/rejected": 1.026877999305725, "step": 2370 }, { "epoch": 1.28, "learning_rate": 7.956389251200741e-08, "logits/chosen": -1.9763824939727783, "logits/rejected": -1.9815596342086792, "logps/chosen": -5.196170330047607, "logps/rejected": -1.238969326019287, "loss": 0.2298, "rewards/accuracies": 1.0, "rewards/chosen": 1.907434105873108, "rewards/margins": 1.3534834384918213, "rewards/rejected": 0.5539506673812866, "step": 2371 }, { "epoch": 1.28, "learning_rate": 7.954627816988404e-08, "logits/chosen": -2.0130457878112793, "logits/rejected": -2.2639293670654297, "logps/chosen": -0.31921494007110596, "logps/rejected": -0.40579357743263245, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.9774588942527771, "rewards/margins": 0.007130563259124756, "rewards/rejected": 0.9703283309936523, "step": 2372 }, { "epoch": 1.28, "learning_rate": 7.952865819161252e-08, "logits/chosen": -2.127971649169922, "logits/rejected": -2.2929301261901855, "logps/chosen": -0.6512770652770996, "logps/rejected": -0.6617811918258667, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.8491945266723633, "rewards/margins": 0.027552783489227295, "rewards/rejected": 0.821641743183136, "step": 2373 }, { "epoch": 1.28, "learning_rate": 7.951103258055397e-08, "logits/chosen": -2.0004968643188477, "logits/rejected": -2.2592713832855225, "logps/chosen": -0.5272276997566223, "logps/rejected": -0.5525691509246826, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 1.0089770555496216, "rewards/margins": 0.00714111328125, "rewards/rejected": 1.0018359422683716, "step": 2374 }, { "epoch": 1.28, "learning_rate": 7.949340134007061e-08, "logits/chosen": -1.9814764261245728, "logits/rejected": -1.981584906578064, "logps/chosen": -1.1540684700012207, "logps/rejected": -1.5133693218231201, "loss": 0.5789, "rewards/accuracies": 1.0, "rewards/chosen": 0.9606949090957642, "rewards/margins": 0.24326181411743164, "rewards/rejected": 0.7174330949783325, "step": 2375 }, { "epoch": 1.28, "learning_rate": 7.94757644735257e-08, "logits/chosen": -2.134122133255005, "logits/rejected": -2.1849782466888428, "logps/chosen": -12.552767753601074, "logps/rejected": -21.152982711791992, "loss": 0.37, "rewards/accuracies": 1.0, "rewards/chosen": 1.5809438228607178, "rewards/margins": 0.803559422492981, "rewards/rejected": 0.7773844003677368, "step": 2376 }, { "epoch": 1.28, "learning_rate": 7.94581219842836e-08, "logits/chosen": -2.2144711017608643, "logits/rejected": -2.0419132709503174, "logps/chosen": -51.72541427612305, "logps/rejected": -11.757932662963867, "loss": 0.2353, "rewards/accuracies": 1.0, "rewards/chosen": 2.2338244915008545, "rewards/margins": 1.3270461559295654, "rewards/rejected": 0.9067783355712891, "step": 2377 }, { "epoch": 1.28, "learning_rate": 7.944047387570973e-08, "logits/chosen": -2.0974788665771484, "logits/rejected": -2.063283920288086, "logps/chosen": -5.622876167297363, "logps/rejected": -5.33968448638916, "loss": 0.4057, "rewards/accuracies": 1.0, "rewards/chosen": 1.1717044115066528, "rewards/margins": 0.6923246383666992, "rewards/rejected": 0.4793797433376312, "step": 2378 }, { "epoch": 1.28, "learning_rate": 7.942282015117058e-08, "logits/chosen": -2.0800983905792236, "logits/rejected": -2.2675554752349854, "logps/chosen": -2.1749656200408936, "logps/rejected": -3.180760145187378, "loss": 0.654, "rewards/accuracies": 1.0, "rewards/chosen": 0.94617760181427, "rewards/margins": 0.07988089323043823, "rewards/rejected": 0.8662967085838318, "step": 2379 }, { "epoch": 1.28, "learning_rate": 7.940516081403372e-08, "logits/chosen": -2.042248487472534, "logits/rejected": -2.0372323989868164, "logps/chosen": -1.1532437801361084, "logps/rejected": -5.336644172668457, "loss": 0.3503, "rewards/accuracies": 1.0, "rewards/chosen": 1.3050395250320435, "rewards/margins": 0.8687090873718262, "rewards/rejected": 0.4363304078578949, "step": 2380 }, { "epoch": 1.28, "learning_rate": 7.938749586766777e-08, "logits/chosen": -2.0274548530578613, "logits/rejected": -2.030581474304199, "logps/chosen": -1.4619377851486206, "logps/rejected": -2.1124629974365234, "loss": 0.5023, "rewards/accuracies": 1.0, "rewards/chosen": 1.080795168876648, "rewards/margins": 0.42699944972991943, "rewards/rejected": 0.6537957191467285, "step": 2381 }, { "epoch": 1.28, "learning_rate": 7.936982531544247e-08, "logits/chosen": -2.0888447761535645, "logits/rejected": -2.0860235691070557, "logps/chosen": -8.10004997253418, "logps/rejected": -4.19951057434082, "loss": 0.2886, "rewards/accuracies": 1.0, "rewards/chosen": 1.5930891036987305, "rewards/margins": 1.0950934886932373, "rewards/rejected": 0.4979955852031708, "step": 2382 }, { "epoch": 1.29, "learning_rate": 7.935214916072856e-08, "logits/chosen": -2.224900722503662, "logits/rejected": -2.4021658897399902, "logps/chosen": -11.326075553894043, "logps/rejected": -27.249134063720703, "loss": 0.6482, "rewards/accuracies": 1.0, "rewards/chosen": 1.0346646308898926, "rewards/margins": 0.09190565347671509, "rewards/rejected": 0.9427589774131775, "step": 2383 }, { "epoch": 1.29, "learning_rate": 7.933446740689789e-08, "logits/chosen": -2.0441219806671143, "logits/rejected": -2.2938127517700195, "logps/chosen": -0.4357142150402069, "logps/rejected": -0.5590142011642456, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 0.7786128520965576, "rewards/margins": -0.010256171226501465, "rewards/rejected": 0.7888690233230591, "step": 2384 }, { "epoch": 1.29, "learning_rate": 7.931678005732338e-08, "logits/chosen": -1.9854834079742432, "logits/rejected": -1.9917505979537964, "logps/chosen": -1.3604904413223267, "logps/rejected": -4.65179967880249, "loss": 0.4332, "rewards/accuracies": 1.0, "rewards/chosen": 1.0173046588897705, "rewards/margins": 0.6120721101760864, "rewards/rejected": 0.40523257851600647, "step": 2385 }, { "epoch": 1.29, "learning_rate": 7.9299087115379e-08, "logits/chosen": -2.0281598567962646, "logits/rejected": -2.0182297229766846, "logps/chosen": -4.313006401062012, "logps/rejected": -2.238699197769165, "loss": 0.3774, "rewards/accuracies": 1.0, "rewards/chosen": 1.6336761713027954, "rewards/margins": 0.7799215316772461, "rewards/rejected": 0.8537546396255493, "step": 2386 }, { "epoch": 1.29, "learning_rate": 7.928138858443983e-08, "logits/chosen": -2.026409149169922, "logits/rejected": -2.0283737182617188, "logps/chosen": -2.667982578277588, "logps/rejected": -0.8631764054298401, "loss": 0.6269, "rewards/accuracies": 1.0, "rewards/chosen": 1.0907350778579712, "rewards/margins": 0.13713020086288452, "rewards/rejected": 0.9536048769950867, "step": 2387 }, { "epoch": 1.29, "learning_rate": 7.926368446788192e-08, "logits/chosen": -2.0788004398345947, "logits/rejected": -2.2739901542663574, "logps/chosen": -1.3853222131729126, "logps/rejected": -1.5067660808563232, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 0.6062566041946411, "rewards/margins": 0.03374522924423218, "rewards/rejected": 0.5725113749504089, "step": 2388 }, { "epoch": 1.29, "learning_rate": 7.92459747690825e-08, "logits/chosen": -2.0658648014068604, "logits/rejected": -2.0653321743011475, "logps/chosen": -1.0356214046478271, "logps/rejected": -1.5550830364227295, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.900189220905304, "rewards/margins": 0.017220795154571533, "rewards/rejected": 0.8829684257507324, "step": 2389 }, { "epoch": 1.29, "learning_rate": 7.92282594914198e-08, "logits/chosen": -2.0358893871307373, "logits/rejected": -2.283076524734497, "logps/chosen": -0.6073817014694214, "logps/rejected": -0.6028456091880798, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.8307958841323853, "rewards/margins": 0.017300724983215332, "rewards/rejected": 0.8134951591491699, "step": 2390 }, { "epoch": 1.29, "learning_rate": 7.92105386382731e-08, "logits/chosen": -2.1219353675842285, "logits/rejected": -2.1215970516204834, "logps/chosen": -2.3197402954101562, "logps/rejected": -1.7751398086547852, "loss": 0.4841, "rewards/accuracies": 1.0, "rewards/chosen": 1.110729694366455, "rewards/margins": 0.47378164529800415, "rewards/rejected": 0.6369480490684509, "step": 2391 }, { "epoch": 1.29, "learning_rate": 7.919281221302278e-08, "logits/chosen": -2.08573579788208, "logits/rejected": -2.2287678718566895, "logps/chosen": -0.62065190076828, "logps/rejected": -0.6259849071502686, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.9333047866821289, "rewards/margins": 0.01801919937133789, "rewards/rejected": 0.915285587310791, "step": 2392 }, { "epoch": 1.29, "learning_rate": 7.917508021905032e-08, "logits/chosen": -2.0221822261810303, "logits/rejected": -2.2350032329559326, "logps/chosen": -0.4993778467178345, "logps/rejected": -0.5303378105163574, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.9638986587524414, "rewards/margins": 0.01008373498916626, "rewards/rejected": 0.9538149237632751, "step": 2393 }, { "epoch": 1.29, "learning_rate": 7.915734265973817e-08, "logits/chosen": -2.012887954711914, "logits/rejected": -2.0493931770324707, "logps/chosen": -1.80849289894104, "logps/rejected": -9.830361366271973, "loss": 0.3086, "rewards/accuracies": 1.0, "rewards/chosen": 1.4919248819351196, "rewards/margins": 1.0173075199127197, "rewards/rejected": 0.4746173024177551, "step": 2394 }, { "epoch": 1.29, "learning_rate": 7.913959953846988e-08, "logits/chosen": -2.0254788398742676, "logits/rejected": -2.0289807319641113, "logps/chosen": -0.5236905217170715, "logps/rejected": -3.7765297889709473, "loss": 0.4819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8833681344985962, "rewards/margins": 0.47933289408683777, "rewards/rejected": 0.4040352404117584, "step": 2395 }, { "epoch": 1.29, "learning_rate": 7.912185085863008e-08, "logits/chosen": -2.0941481590270996, "logits/rejected": -2.0960659980773926, "logps/chosen": -0.3941177725791931, "logps/rejected": -4.692505836486816, "loss": 0.4419, "rewards/accuracies": 1.0, "rewards/chosen": 0.97728031873703, "rewards/margins": 0.5874971151351929, "rewards/rejected": 0.38978320360183716, "step": 2396 }, { "epoch": 1.29, "learning_rate": 7.910409662360447e-08, "logits/chosen": -2.0908401012420654, "logits/rejected": -2.2802562713623047, "logps/chosen": -11.405250549316406, "logps/rejected": -0.49541744589805603, "loss": 0.8405, "rewards/accuracies": 0.0, "rewards/chosen": 0.6452207565307617, "rewards/margins": -0.2757914662361145, "rewards/rejected": 0.9210122227668762, "step": 2397 }, { "epoch": 1.29, "learning_rate": 7.908633683677976e-08, "logits/chosen": -1.9769001007080078, "logits/rejected": -2.224442958831787, "logps/chosen": -1.3971834182739258, "logps/rejected": -1.416837453842163, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": 0.8084284663200378, "rewards/margins": -0.006635904312133789, "rewards/rejected": 0.8150643706321716, "step": 2398 }, { "epoch": 1.29, "learning_rate": 7.906857150154376e-08, "logits/chosen": -2.098719596862793, "logits/rejected": -2.3182098865509033, "logps/chosen": -2.3357961177825928, "logps/rejected": -1.2466013431549072, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.9943681955337524, "rewards/margins": 0.02684241533279419, "rewards/rejected": 0.9675257802009583, "step": 2399 }, { "epoch": 1.29, "learning_rate": 7.905080062128531e-08, "logits/chosen": -2.025956153869629, "logits/rejected": -2.016012191772461, "logps/chosen": -6.313979148864746, "logps/rejected": -5.305668354034424, "loss": 0.4546, "rewards/accuracies": 1.0, "rewards/chosen": 1.1133313179016113, "rewards/margins": 0.5523072481155396, "rewards/rejected": 0.5610240697860718, "step": 2400 }, { "epoch": 1.3, "learning_rate": 7.903302419939435e-08, "logits/chosen": -2.011777400970459, "logits/rejected": -2.0119638442993164, "logps/chosen": -1.0239908695220947, "logps/rejected": -3.3491477966308594, "loss": 0.5418, "rewards/accuracies": 1.0, "rewards/chosen": 1.1211633682250977, "rewards/margins": 0.32977473735809326, "rewards/rejected": 0.7913886308670044, "step": 2401 }, { "epoch": 1.3, "learning_rate": 7.901524223926182e-08, "logits/chosen": -2.124986171722412, "logits/rejected": -2.1286988258361816, "logps/chosen": -2.495838165283203, "logps/rejected": -14.513545989990234, "loss": 0.2536, "rewards/accuracies": 1.0, "rewards/chosen": 1.5340427160263062, "rewards/margins": 1.2427390813827515, "rewards/rejected": 0.2913036346435547, "step": 2402 }, { "epoch": 1.3, "learning_rate": 7.899745474427977e-08, "logits/chosen": -2.073887825012207, "logits/rejected": -2.0626680850982666, "logps/chosen": -1.2449207305908203, "logps/rejected": -10.559076309204102, "loss": 0.5683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9999794363975525, "rewards/margins": 0.26762908697128296, "rewards/rejected": 0.7323503494262695, "step": 2403 }, { "epoch": 1.3, "learning_rate": 7.897966171784127e-08, "logits/chosen": -2.052215099334717, "logits/rejected": -2.0583841800689697, "logps/chosen": -3.015052318572998, "logps/rejected": -12.660542488098145, "loss": 0.3866, "rewards/accuracies": 1.0, "rewards/chosen": 1.060518503189087, "rewards/margins": 0.7509195804595947, "rewards/rejected": 0.3095989227294922, "step": 2404 }, { "epoch": 1.3, "learning_rate": 7.896186316334045e-08, "logits/chosen": -2.028613328933716, "logits/rejected": -2.257676839828491, "logps/chosen": -0.4258984923362732, "logps/rejected": -0.42878225445747375, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.8666462898254395, "rewards/margins": 0.003543376922607422, "rewards/rejected": 0.863102912902832, "step": 2405 }, { "epoch": 1.3, "learning_rate": 7.894405908417255e-08, "logits/chosen": -2.075190782546997, "logits/rejected": -2.07200026512146, "logps/chosen": -5.602145671844482, "logps/rejected": -5.210622310638428, "loss": 0.3037, "rewards/accuracies": 1.0, "rewards/chosen": 1.6597931385040283, "rewards/margins": 1.036072015762329, "rewards/rejected": 0.623721182346344, "step": 2406 }, { "epoch": 1.3, "learning_rate": 7.892624948373374e-08, "logits/chosen": -2.0703330039978027, "logits/rejected": -2.0426228046417236, "logps/chosen": -6.070620536804199, "logps/rejected": -3.9108965396881104, "loss": 0.4655, "rewards/accuracies": 1.0, "rewards/chosen": 1.1913713216781616, "rewards/margins": 0.5228856801986694, "rewards/rejected": 0.6684856414794922, "step": 2407 }, { "epoch": 1.3, "learning_rate": 7.890843436542138e-08, "logits/chosen": -2.0183441638946533, "logits/rejected": -2.019569158554077, "logps/chosen": -0.40659549832344055, "logps/rejected": -3.003509283065796, "loss": 0.5409, "rewards/accuracies": 1.0, "rewards/chosen": 1.024418830871582, "rewards/margins": 0.3319705128669739, "rewards/rejected": 0.6924483180046082, "step": 2408 }, { "epoch": 1.3, "learning_rate": 7.889061373263378e-08, "logits/chosen": -2.010565757751465, "logits/rejected": -2.011162519454956, "logps/chosen": -1.5570976734161377, "logps/rejected": -6.078190803527832, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 0.8903285264968872, "rewards/margins": 0.4240076243877411, "rewards/rejected": 0.4663209021091461, "step": 2409 }, { "epoch": 1.3, "learning_rate": 7.887278758877038e-08, "logits/chosen": -2.0752456188201904, "logits/rejected": -2.13810396194458, "logps/chosen": -3.4192535877227783, "logps/rejected": -15.115301132202148, "loss": 0.3535, "rewards/accuracies": 1.0, "rewards/chosen": 1.5860753059387207, "rewards/margins": 0.857938826084137, "rewards/rejected": 0.7281364798545837, "step": 2410 }, { "epoch": 1.3, "learning_rate": 7.885495593723162e-08, "logits/chosen": -1.9998698234558105, "logits/rejected": -1.9690349102020264, "logps/chosen": -15.187686920166016, "logps/rejected": -2.380120038986206, "loss": 0.3808, "rewards/accuracies": 1.0, "rewards/chosen": 1.5675255060195923, "rewards/margins": 0.7690106630325317, "rewards/rejected": 0.7985148429870605, "step": 2411 }, { "epoch": 1.3, "learning_rate": 7.8837118781419e-08, "logits/chosen": -2.0043866634368896, "logits/rejected": -2.0880558490753174, "logps/chosen": -2.587742567062378, "logps/rejected": -21.476619720458984, "loss": 0.6288, "rewards/accuracies": 1.0, "rewards/chosen": 0.8156123161315918, "rewards/margins": 0.13305461406707764, "rewards/rejected": 0.6825577020645142, "step": 2412 }, { "epoch": 1.3, "learning_rate": 7.881927612473509e-08, "logits/chosen": -2.042501211166382, "logits/rejected": -2.2752368450164795, "logps/chosen": -0.48100340366363525, "logps/rejected": -0.4794255197048187, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.7747284770011902, "rewards/margins": 0.007588565349578857, "rewards/rejected": 0.7671399116516113, "step": 2413 }, { "epoch": 1.3, "learning_rate": 7.880142797058347e-08, "logits/chosen": -2.196955680847168, "logits/rejected": -2.2024059295654297, "logps/chosen": -1.682149052619934, "logps/rejected": -4.129843235015869, "loss": 0.4706, "rewards/accuracies": 1.0, "rewards/chosen": 0.9875936508178711, "rewards/margins": 0.5091313123703003, "rewards/rejected": 0.4784623682498932, "step": 2414 }, { "epoch": 1.3, "learning_rate": 7.878357432236879e-08, "logits/chosen": -1.965495228767395, "logits/rejected": -1.9614312648773193, "logps/chosen": -7.5810089111328125, "logps/rejected": -3.6402108669281006, "loss": 0.3024, "rewards/accuracies": 1.0, "rewards/chosen": 1.6368716955184937, "rewards/margins": 1.0411361455917358, "rewards/rejected": 0.5957355499267578, "step": 2415 }, { "epoch": 1.3, "learning_rate": 7.87657151834968e-08, "logits/chosen": -2.0035295486450195, "logits/rejected": -2.325880765914917, "logps/chosen": -6.0880889892578125, "logps/rejected": -2.201831102371216, "loss": 0.7659, "rewards/accuracies": 0.0, "rewards/chosen": 0.8369885683059692, "rewards/margins": -0.1405053734779358, "rewards/rejected": 0.977493941783905, "step": 2416 }, { "epoch": 1.3, "learning_rate": 7.874785055737421e-08, "logits/chosen": -2.0195114612579346, "logits/rejected": -2.0129237174987793, "logps/chosen": -2.635848045349121, "logps/rejected": -5.643309593200684, "loss": 0.3586, "rewards/accuracies": 1.0, "rewards/chosen": 1.2426905632019043, "rewards/margins": 0.8409034013748169, "rewards/rejected": 0.4017871916294098, "step": 2417 }, { "epoch": 1.3, "learning_rate": 7.872998044740884e-08, "logits/chosen": -2.1084342002868652, "logits/rejected": -2.109405755996704, "logps/chosen": -2.093266725540161, "logps/rejected": -1.6831161975860596, "loss": 0.6094, "rewards/accuracies": 1.0, "rewards/chosen": 1.1139830350875854, "rewards/margins": 0.17513293027877808, "rewards/rejected": 0.9388501048088074, "step": 2418 }, { "epoch": 1.3, "learning_rate": 7.87121048570095e-08, "logits/chosen": -2.1214182376861572, "logits/rejected": -2.290736675262451, "logps/chosen": -3.917423963546753, "logps/rejected": -4.415179252624512, "loss": 0.82, "rewards/accuracies": 0.0, "rewards/chosen": 0.8241873979568481, "rewards/margins": -0.23947465419769287, "rewards/rejected": 1.063662052154541, "step": 2419 }, { "epoch": 1.31, "learning_rate": 7.869422378958609e-08, "logits/chosen": -2.1014468669891357, "logits/rejected": -2.0682003498077393, "logps/chosen": -3.1419763565063477, "logps/rejected": -3.2671873569488525, "loss": 0.4563, "rewards/accuracies": 1.0, "rewards/chosen": 1.1267459392547607, "rewards/margins": 0.5478904247283936, "rewards/rejected": 0.5788555145263672, "step": 2420 }, { "epoch": 1.31, "learning_rate": 7.867633724854955e-08, "logits/chosen": -2.065547466278076, "logits/rejected": -2.070650100708008, "logps/chosen": -1.427086353302002, "logps/rejected": -3.1943471431732178, "loss": 0.4235, "rewards/accuracies": 1.0, "rewards/chosen": 1.1471041440963745, "rewards/margins": 0.6398611664772034, "rewards/rejected": 0.5072429776191711, "step": 2421 }, { "epoch": 1.31, "learning_rate": 7.865844523731186e-08, "logits/chosen": -2.162750244140625, "logits/rejected": -2.139662504196167, "logps/chosen": -9.94830322265625, "logps/rejected": -7.0743408203125, "loss": 0.2949, "rewards/accuracies": 1.0, "rewards/chosen": 1.6861851215362549, "rewards/margins": 1.0698645114898682, "rewards/rejected": 0.6163206100463867, "step": 2422 }, { "epoch": 1.31, "learning_rate": 7.864054775928602e-08, "logits/chosen": -2.004587411880493, "logits/rejected": -2.2128047943115234, "logps/chosen": -1.084360122680664, "logps/rejected": -1.0667800903320312, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.8277561068534851, "rewards/margins": -0.002244710922241211, "rewards/rejected": 0.8300008177757263, "step": 2423 }, { "epoch": 1.31, "learning_rate": 7.86226448178861e-08, "logits/chosen": -2.1568610668182373, "logits/rejected": -2.3099513053894043, "logps/chosen": -2.2887930870056152, "logps/rejected": -2.2752368450164795, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.9613895416259766, "rewards/margins": 0.020832538604736328, "rewards/rejected": 0.9405570030212402, "step": 2424 }, { "epoch": 1.31, "learning_rate": 7.860473641652722e-08, "logits/chosen": -2.0280377864837646, "logits/rejected": -2.0269174575805664, "logps/chosen": -2.4475889205932617, "logps/rejected": -4.748992443084717, "loss": 0.611, "rewards/accuracies": 1.0, "rewards/chosen": 1.0828200578689575, "rewards/margins": 0.1716008186340332, "rewards/rejected": 0.9112192392349243, "step": 2425 }, { "epoch": 1.31, "learning_rate": 7.85868225586255e-08, "logits/chosen": -2.0589778423309326, "logits/rejected": -2.0504353046417236, "logps/chosen": -7.95045804977417, "logps/rejected": -0.7906507253646851, "loss": 0.4879, "rewards/accuracies": 1.0, "rewards/chosen": 1.4901437759399414, "rewards/margins": 0.4636819362640381, "rewards/rejected": 1.0264618396759033, "step": 2426 }, { "epoch": 1.31, "learning_rate": 7.856890324759815e-08, "logits/chosen": -2.053550958633423, "logits/rejected": -2.0957579612731934, "logps/chosen": -3.6085360050201416, "logps/rejected": -9.516319274902344, "loss": 0.3692, "rewards/accuracies": 1.0, "rewards/chosen": 1.479034662246704, "rewards/margins": 0.8060324192047119, "rewards/rejected": 0.6730022430419922, "step": 2427 }, { "epoch": 1.31, "learning_rate": 7.855097848686341e-08, "logits/chosen": -2.0597870349884033, "logits/rejected": -2.0541629791259766, "logps/chosen": -8.425857543945312, "logps/rejected": -8.622635841369629, "loss": 0.3023, "rewards/accuracies": 1.0, "rewards/chosen": 1.492012858390808, "rewards/margins": 1.0415679216384888, "rewards/rejected": 0.45044490694999695, "step": 2428 }, { "epoch": 1.31, "learning_rate": 7.85330482798405e-08, "logits/chosen": -2.1099517345428467, "logits/rejected": -2.2506847381591797, "logps/chosen": -0.6249048113822937, "logps/rejected": -0.564380407333374, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.9880668520927429, "rewards/margins": 0.02689075469970703, "rewards/rejected": 0.9611760973930359, "step": 2429 }, { "epoch": 1.31, "learning_rate": 7.851511262994975e-08, "logits/chosen": -2.0940489768981934, "logits/rejected": -2.3186819553375244, "logps/chosen": -1.2634062767028809, "logps/rejected": -1.178725242614746, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 1.1650539636611938, "rewards/margins": 0.03127789497375488, "rewards/rejected": 1.133776068687439, "step": 2430 }, { "epoch": 1.31, "learning_rate": 7.849717154061251e-08, "logits/chosen": -2.0435595512390137, "logits/rejected": -2.2388079166412354, "logps/chosen": -0.5482994318008423, "logps/rejected": -0.6176755428314209, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.8957242369651794, "rewards/margins": -0.0081709623336792, "rewards/rejected": 0.9038951992988586, "step": 2431 }, { "epoch": 1.31, "learning_rate": 7.847922501525115e-08, "logits/chosen": -1.9733303785324097, "logits/rejected": -2.305950403213501, "logps/chosen": -3.262162446975708, "logps/rejected": -6.121267795562744, "loss": 0.5907, "rewards/accuracies": 1.0, "rewards/chosen": 0.7451039552688599, "rewards/margins": 0.21668046712875366, "rewards/rejected": 0.5284234881401062, "step": 2432 }, { "epoch": 1.31, "learning_rate": 7.846127305728911e-08, "logits/chosen": -2.097208023071289, "logits/rejected": -2.096928119659424, "logps/chosen": -4.187158584594727, "logps/rejected": -3.2605199813842773, "loss": 0.4632, "rewards/accuracies": 1.0, "rewards/chosen": 1.1425504684448242, "rewards/margins": 0.5291799306869507, "rewards/rejected": 0.6133705377578735, "step": 2433 }, { "epoch": 1.31, "learning_rate": 7.844331567015082e-08, "logits/chosen": -2.0573675632476807, "logits/rejected": -2.2691760063171387, "logps/chosen": -7.380393028259277, "logps/rejected": -7.985407829284668, "loss": 0.6572, "rewards/accuracies": 1.0, "rewards/chosen": 0.7259644865989685, "rewards/margins": 0.07325422763824463, "rewards/rejected": 0.6527102589607239, "step": 2434 }, { "epoch": 1.31, "learning_rate": 7.842535285726178e-08, "logits/chosen": -1.9669259786605835, "logits/rejected": -2.246277093887329, "logps/chosen": -1.693302035331726, "logps/rejected": -1.782672643661499, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.9066009521484375, "rewards/margins": 0.011308848857879639, "rewards/rejected": 0.8952921032905579, "step": 2435 }, { "epoch": 1.31, "learning_rate": 7.840738462204853e-08, "logits/chosen": -2.0764806270599365, "logits/rejected": -2.2294387817382812, "logps/chosen": -0.32851213216781616, "logps/rejected": -0.29061785340309143, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.9005804061889648, "rewards/margins": 0.015322387218475342, "rewards/rejected": 0.8852580189704895, "step": 2436 }, { "epoch": 1.31, "learning_rate": 7.838941096793858e-08, "logits/chosen": -2.183241605758667, "logits/rejected": -2.190610885620117, "logps/chosen": -1.1280224323272705, "logps/rejected": -3.455883026123047, "loss": 0.4638, "rewards/accuracies": 1.0, "rewards/chosen": 0.9523016810417175, "rewards/margins": 0.5274105072021484, "rewards/rejected": 0.4248912036418915, "step": 2437 }, { "epoch": 1.31, "learning_rate": 7.83714318983606e-08, "logits/chosen": -2.047346353530884, "logits/rejected": -2.0544345378875732, "logps/chosen": -2.2223103046417236, "logps/rejected": -4.755077838897705, "loss": 0.4837, "rewards/accuracies": 1.0, "rewards/chosen": 0.991917610168457, "rewards/margins": 0.4747956395149231, "rewards/rejected": 0.5171219706535339, "step": 2438 }, { "epoch": 1.32, "learning_rate": 7.835344741674415e-08, "logits/chosen": -1.9758809804916382, "logits/rejected": -1.9759407043457031, "logps/chosen": -0.4637652039527893, "logps/rejected": -3.5024361610412598, "loss": 0.5389, "rewards/accuracies": 1.0, "rewards/chosen": 0.9850655794143677, "rewards/margins": 0.3367982506752014, "rewards/rejected": 0.6482673287391663, "step": 2439 }, { "epoch": 1.32, "learning_rate": 7.833545752651992e-08, "logits/chosen": -2.054183006286621, "logits/rejected": -2.0452980995178223, "logps/chosen": -4.863564968109131, "logps/rejected": -4.761344909667969, "loss": 0.382, "rewards/accuracies": 1.0, "rewards/chosen": 1.2795170545578003, "rewards/margins": 0.7651351690292358, "rewards/rejected": 0.5143818855285645, "step": 2440 }, { "epoch": 1.32, "learning_rate": 7.83174622311196e-08, "logits/chosen": -2.0602290630340576, "logits/rejected": -2.307663679122925, "logps/chosen": -2.853259801864624, "logps/rejected": -8.974055290222168, "loss": 0.5476, "rewards/accuracies": 1.0, "rewards/chosen": 0.7484545707702637, "rewards/margins": 0.31602048873901367, "rewards/rejected": 0.43243408203125, "step": 2441 }, { "epoch": 1.32, "learning_rate": 7.82994615339759e-08, "logits/chosen": -1.9971823692321777, "logits/rejected": -1.9974483251571655, "logps/chosen": -0.6840327382087708, "logps/rejected": -3.9567790031433105, "loss": 0.4783, "rewards/accuracies": 1.0, "rewards/chosen": 1.0276575088500977, "rewards/margins": 0.48873478174209595, "rewards/rejected": 0.5389227271080017, "step": 2442 }, { "epoch": 1.32, "learning_rate": 7.828145543852258e-08, "logits/chosen": -2.1038503646850586, "logits/rejected": -2.2979929447174072, "logps/chosen": -1.0231150388717651, "logps/rejected": -17.456432342529297, "loss": 0.5372, "rewards/accuracies": 1.0, "rewards/chosen": 1.21639883518219, "rewards/margins": 0.34088003635406494, "rewards/rejected": 0.875518798828125, "step": 2443 }, { "epoch": 1.32, "learning_rate": 7.826344394819442e-08, "logits/chosen": -2.100156784057617, "logits/rejected": -2.268526315689087, "logps/chosen": -0.3532026410102844, "logps/rejected": -0.3889263868331909, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.9900466799736023, "rewards/margins": 0.021413028240203857, "rewards/rejected": 0.9686336517333984, "step": 2444 }, { "epoch": 1.32, "learning_rate": 7.824542706642723e-08, "logits/chosen": -2.0811986923217773, "logits/rejected": -2.290231704711914, "logps/chosen": -0.3774813413619995, "logps/rejected": -0.3997916281223297, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.8754798769950867, "rewards/margins": 0.028798282146453857, "rewards/rejected": 0.8466815948486328, "step": 2445 }, { "epoch": 1.32, "learning_rate": 7.822740479665786e-08, "logits/chosen": -2.1371774673461914, "logits/rejected": -2.147188663482666, "logps/chosen": -2.0589985847473145, "logps/rejected": -4.372230529785156, "loss": 0.4464, "rewards/accuracies": 1.0, "rewards/chosen": 1.0592561960220337, "rewards/margins": 0.5749161243438721, "rewards/rejected": 0.484340101480484, "step": 2446 }, { "epoch": 1.32, "learning_rate": 7.820937714232417e-08, "logits/chosen": -2.0217196941375732, "logits/rejected": -2.013686418533325, "logps/chosen": -5.297208786010742, "logps/rejected": -7.7924346923828125, "loss": 0.3029, "rewards/accuracies": 1.0, "rewards/chosen": 1.6087522506713867, "rewards/margins": 1.0390361547470093, "rewards/rejected": 0.5697160959243774, "step": 2447 }, { "epoch": 1.32, "learning_rate": 7.819134410686504e-08, "logits/chosen": -2.093588352203369, "logits/rejected": -2.2967278957366943, "logps/chosen": -0.187798410654068, "logps/rejected": -0.20174486935138702, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 1.0352784395217896, "rewards/margins": 0.02352595329284668, "rewards/rejected": 1.0117524862289429, "step": 2448 }, { "epoch": 1.32, "learning_rate": 7.817330569372041e-08, "logits/chosen": -2.0690500736236572, "logits/rejected": -2.282982110977173, "logps/chosen": -7.156559467315674, "logps/rejected": -0.632078230381012, "loss": 0.7519, "rewards/accuracies": 0.0, "rewards/chosen": 0.7765495777130127, "rewards/margins": -0.11417233943939209, "rewards/rejected": 0.8907219171524048, "step": 2449 }, { "epoch": 1.32, "learning_rate": 7.815526190633123e-08, "logits/chosen": -1.9589970111846924, "logits/rejected": -2.2108566761016846, "logps/chosen": -0.7676190137863159, "logps/rejected": -3.0507657527923584, "loss": 0.5804, "rewards/accuracies": 1.0, "rewards/chosen": 1.0277615785598755, "rewards/margins": 0.2397371530532837, "rewards/rejected": 0.7880244255065918, "step": 2450 }, { "epoch": 1.32, "learning_rate": 7.813721274813946e-08, "logits/chosen": -2.025357723236084, "logits/rejected": -2.2200794219970703, "logps/chosen": -0.5373839735984802, "logps/rejected": -0.5449427962303162, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 1.008678913116455, "rewards/margins": 0.0019811391830444336, "rewards/rejected": 1.0066977739334106, "step": 2451 }, { "epoch": 1.32, "learning_rate": 7.81191582225881e-08, "logits/chosen": -2.033259868621826, "logits/rejected": -2.0282347202301025, "logps/chosen": -9.261419296264648, "logps/rejected": -2.0682148933410645, "loss": 0.3722, "rewards/accuracies": 1.0, "rewards/chosen": 1.4569391012191772, "rewards/margins": 0.7964246869087219, "rewards/rejected": 0.6605144143104553, "step": 2452 }, { "epoch": 1.32, "learning_rate": 7.810109833312115e-08, "logits/chosen": -2.0245635509490967, "logits/rejected": -2.1950085163116455, "logps/chosen": -0.7525742650032043, "logps/rejected": -0.7625486850738525, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.8291301131248474, "rewards/margins": 0.027634918689727783, "rewards/rejected": 0.8014951944351196, "step": 2453 }, { "epoch": 1.32, "learning_rate": 7.808303308318369e-08, "logits/chosen": -2.1652324199676514, "logits/rejected": -2.103541612625122, "logps/chosen": -17.647769927978516, "logps/rejected": -10.796220779418945, "loss": 0.1827, "rewards/accuracies": 1.0, "rewards/chosen": 1.9330013990402222, "rewards/margins": 1.6073963642120361, "rewards/rejected": 0.32560500502586365, "step": 2454 }, { "epoch": 1.32, "learning_rate": 7.806496247622176e-08, "logits/chosen": -2.06064772605896, "logits/rejected": -2.273397207260132, "logps/chosen": -3.4069161415100098, "logps/rejected": -8.857623100280762, "loss": 0.7353, "rewards/accuracies": 0.0, "rewards/chosen": 0.7740984559059143, "rewards/margins": -0.08253520727157593, "rewards/rejected": 0.8566336631774902, "step": 2455 }, { "epoch": 1.32, "learning_rate": 7.804688651568245e-08, "logits/chosen": -2.0486690998077393, "logits/rejected": -2.1193127632141113, "logps/chosen": -2.2814793586730957, "logps/rejected": -20.113445281982422, "loss": 0.5163, "rewards/accuracies": 1.0, "rewards/chosen": 1.3513973951339722, "rewards/margins": 0.39184272289276123, "rewards/rejected": 0.9595546722412109, "step": 2456 }, { "epoch": 1.33, "learning_rate": 7.802880520501389e-08, "logits/chosen": -1.997915267944336, "logits/rejected": -2.260982036590576, "logps/chosen": -0.8869257569313049, "logps/rejected": -0.9760480523109436, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 1.0252735614776611, "rewards/margins": 0.020932912826538086, "rewards/rejected": 1.004340648651123, "step": 2457 }, { "epoch": 1.33, "learning_rate": 7.80107185476652e-08, "logits/chosen": -2.032925844192505, "logits/rejected": -2.208940029144287, "logps/chosen": -0.9500894546508789, "logps/rejected": -0.947616696357727, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 1.0222077369689941, "rewards/margins": 0.0122755765914917, "rewards/rejected": 1.0099321603775024, "step": 2458 }, { "epoch": 1.33, "learning_rate": 7.799262654708652e-08, "logits/chosen": -2.0475974082946777, "logits/rejected": -2.038666009902954, "logps/chosen": -9.021930694580078, "logps/rejected": -4.094347953796387, "loss": 0.3462, "rewards/accuracies": 1.0, "rewards/chosen": 1.4622802734375, "rewards/margins": 0.8827246427536011, "rewards/rejected": 0.5795556306838989, "step": 2459 }, { "epoch": 1.33, "learning_rate": 7.797452920672901e-08, "logits/chosen": -2.0542280673980713, "logits/rejected": -2.00506591796875, "logps/chosen": -14.643798828125, "logps/rejected": -3.4612698554992676, "loss": 0.3305, "rewards/accuracies": 1.0, "rewards/chosen": 1.6842533349990845, "rewards/margins": 0.9372379779815674, "rewards/rejected": 0.7470153570175171, "step": 2460 }, { "epoch": 1.33, "learning_rate": 7.795642653004488e-08, "logits/chosen": -2.2281293869018555, "logits/rejected": -2.1496975421905518, "logps/chosen": -22.61145782470703, "logps/rejected": -6.445071220397949, "loss": 0.4393, "rewards/accuracies": 1.0, "rewards/chosen": 1.3707016706466675, "rewards/margins": 0.5949790477752686, "rewards/rejected": 0.7757226228713989, "step": 2461 }, { "epoch": 1.33, "learning_rate": 7.793831852048733e-08, "logits/chosen": -2.1333017349243164, "logits/rejected": -2.117278575897217, "logps/chosen": -9.351728439331055, "logps/rejected": -4.062657356262207, "loss": 0.3548, "rewards/accuracies": 1.0, "rewards/chosen": 1.4093157052993774, "rewards/margins": 0.8535578846931458, "rewards/rejected": 0.5557578206062317, "step": 2462 }, { "epoch": 1.33, "learning_rate": 7.792020518151057e-08, "logits/chosen": -2.236806631088257, "logits/rejected": -2.2563257217407227, "logps/chosen": -15.040337562561035, "logps/rejected": -17.304763793945312, "loss": 0.5066, "rewards/accuracies": 1.0, "rewards/chosen": 1.5605052709579468, "rewards/margins": 0.41609907150268555, "rewards/rejected": 1.1444061994552612, "step": 2463 }, { "epoch": 1.33, "learning_rate": 7.790208651656987e-08, "logits/chosen": -2.072741985321045, "logits/rejected": -2.085721969604492, "logps/chosen": -5.4952392578125, "logps/rejected": -5.856899738311768, "loss": 0.3085, "rewards/accuracies": 1.0, "rewards/chosen": 1.8390522003173828, "rewards/margins": 1.017669677734375, "rewards/rejected": 0.8213825225830078, "step": 2464 }, { "epoch": 1.33, "learning_rate": 7.788396252912144e-08, "logits/chosen": -2.2021942138671875, "logits/rejected": -2.2023978233337402, "logps/chosen": -0.8187140822410583, "logps/rejected": -4.264066219329834, "loss": 0.4957, "rewards/accuracies": 1.0, "rewards/chosen": 1.0237219333648682, "rewards/margins": 0.4437815546989441, "rewards/rejected": 0.5799403786659241, "step": 2465 }, { "epoch": 1.33, "learning_rate": 7.786583322262259e-08, "logits/chosen": -2.0297305583953857, "logits/rejected": -2.0324769020080566, "logps/chosen": -1.7769408226013184, "logps/rejected": -0.5111045241355896, "loss": 0.631, "rewards/accuracies": 1.0, "rewards/chosen": 0.8599556088447571, "rewards/margins": 0.12844359874725342, "rewards/rejected": 0.7315120100975037, "step": 2466 }, { "epoch": 1.33, "learning_rate": 7.784769860053158e-08, "logits/chosen": -2.007349967956543, "logits/rejected": -2.017346143722534, "logps/chosen": -1.9346678256988525, "logps/rejected": -3.0357301235198975, "loss": 0.3865, "rewards/accuracies": 1.0, "rewards/chosen": 1.4195547103881836, "rewards/margins": 0.7510824799537659, "rewards/rejected": 0.6684722304344177, "step": 2467 }, { "epoch": 1.33, "learning_rate": 7.782955866630772e-08, "logits/chosen": -2.023672342300415, "logits/rejected": -2.2663230895996094, "logps/chosen": -0.7560720443725586, "logps/rejected": -0.737112283706665, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.8797912001609802, "rewards/margins": 0.02838587760925293, "rewards/rejected": 0.8514053225517273, "step": 2468 }, { "epoch": 1.33, "learning_rate": 7.781141342341133e-08, "logits/chosen": -2.1281986236572266, "logits/rejected": -2.1192128658294678, "logps/chosen": -0.5793319344520569, "logps/rejected": -7.119389057159424, "loss": 0.4422, "rewards/accuracies": 1.0, "rewards/chosen": 1.1530874967575073, "rewards/margins": 0.586879551410675, "rewards/rejected": 0.5662079453468323, "step": 2469 }, { "epoch": 1.33, "learning_rate": 7.779326287530373e-08, "logits/chosen": -2.0453977584838867, "logits/rejected": -2.0538530349731445, "logps/chosen": -4.880971908569336, "logps/rejected": -3.7236669063568115, "loss": 0.4344, "rewards/accuracies": 1.0, "rewards/chosen": 1.1172564029693604, "rewards/margins": 0.6086754202842712, "rewards/rejected": 0.5085809826850891, "step": 2470 }, { "epoch": 1.33, "learning_rate": 7.777510702544722e-08, "logits/chosen": -2.1095519065856934, "logits/rejected": -2.0978775024414062, "logps/chosen": -6.837736129760742, "logps/rejected": -4.161092281341553, "loss": 0.3802, "rewards/accuracies": 1.0, "rewards/chosen": 1.2599775791168213, "rewards/margins": 0.7708282470703125, "rewards/rejected": 0.4891493320465088, "step": 2471 }, { "epoch": 1.33, "learning_rate": 7.775694587730523e-08, "logits/chosen": -1.9541051387786865, "logits/rejected": -1.9540094137191772, "logps/chosen": -0.3995574116706848, "logps/rejected": -5.9964399337768555, "loss": 0.4821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9583196640014648, "rewards/margins": 0.4789504110813141, "rewards/rejected": 0.47936925292015076, "step": 2472 }, { "epoch": 1.33, "learning_rate": 7.773877943434204e-08, "logits/chosen": -2.1027796268463135, "logits/rejected": -2.110421657562256, "logps/chosen": -5.219855785369873, "logps/rejected": -2.3562264442443848, "loss": 0.2031, "rewards/accuracies": 1.0, "rewards/chosen": 2.1158509254455566, "rewards/margins": 1.490756630897522, "rewards/rejected": 0.6250942945480347, "step": 2473 }, { "epoch": 1.33, "learning_rate": 7.772060770002306e-08, "logits/chosen": -2.1137571334838867, "logits/rejected": -2.3121986389160156, "logps/chosen": -0.29290351271629333, "logps/rejected": -0.32915225625038147, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.9153891801834106, "rewards/margins": -0.005426943302154541, "rewards/rejected": 0.9208161234855652, "step": 2474 }, { "epoch": 1.33, "learning_rate": 7.770243067781467e-08, "logits/chosen": -2.1403286457061768, "logits/rejected": -2.3162198066711426, "logps/chosen": -1.7025842666625977, "logps/rejected": -4.166622638702393, "loss": 0.7129, "rewards/accuracies": 0.0, "rewards/chosen": 0.8369364142417908, "rewards/margins": -0.039125025272369385, "rewards/rejected": 0.8760614395141602, "step": 2475 }, { "epoch": 1.34, "learning_rate": 7.768424837118423e-08, "logits/chosen": -2.1901488304138184, "logits/rejected": -2.1084303855895996, "logps/chosen": -33.4775390625, "logps/rejected": -1.7065980434417725, "loss": 0.2685, "rewards/accuracies": 1.0, "rewards/chosen": 1.9857505559921265, "rewards/margins": 1.1774497032165527, "rewards/rejected": 0.808300793170929, "step": 2476 }, { "epoch": 1.34, "learning_rate": 7.766606078360017e-08, "logits/chosen": -1.9455053806304932, "logits/rejected": -2.2697701454162598, "logps/chosen": -0.34361177682876587, "logps/rejected": -0.37885183095932007, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 1.0404086112976074, "rewards/margins": -0.0015434026718139648, "rewards/rejected": 1.0419520139694214, "step": 2477 }, { "epoch": 1.34, "learning_rate": 7.764786791853186e-08, "logits/chosen": -2.066115140914917, "logits/rejected": -2.2625908851623535, "logps/chosen": -2.7317235469818115, "logps/rejected": -2.9334795475006104, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 0.6651267409324646, "rewards/margins": -0.015503942966461182, "rewards/rejected": 0.6806306838989258, "step": 2478 }, { "epoch": 1.34, "learning_rate": 7.762966977944973e-08, "logits/chosen": -1.9808282852172852, "logits/rejected": -2.324361801147461, "logps/chosen": -3.9416322708129883, "logps/rejected": -4.109068393707275, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 1.0163172483444214, "rewards/margins": 0.02278846502304077, "rewards/rejected": 0.9935287833213806, "step": 2479 }, { "epoch": 1.34, "learning_rate": 7.761146636982519e-08, "logits/chosen": -2.0235190391540527, "logits/rejected": -2.2579760551452637, "logps/chosen": -0.5330283641815186, "logps/rejected": -0.5401362776756287, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.8616228103637695, "rewards/margins": 0.003280341625213623, "rewards/rejected": 0.8583424687385559, "step": 2480 }, { "epoch": 1.34, "learning_rate": 7.759325769313065e-08, "logits/chosen": -2.1199615001678467, "logits/rejected": -2.127761125564575, "logps/chosen": -1.6447877883911133, "logps/rejected": -4.7973737716674805, "loss": 0.4737, "rewards/accuracies": 1.0, "rewards/chosen": 1.04097580909729, "rewards/margins": 0.5008857846260071, "rewards/rejected": 0.540090024471283, "step": 2481 }, { "epoch": 1.34, "learning_rate": 7.757504375283955e-08, "logits/chosen": -1.9606521129608154, "logits/rejected": -2.2536211013793945, "logps/chosen": -0.853919506072998, "logps/rejected": -0.8623910546302795, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8939937949180603, "rewards/margins": 0.019868195056915283, "rewards/rejected": 0.874125599861145, "step": 2482 }, { "epoch": 1.34, "learning_rate": 7.755682455242632e-08, "logits/chosen": -2.1103570461273193, "logits/rejected": -2.263117551803589, "logps/chosen": -1.5427554845809937, "logps/rejected": -5.211498737335205, "loss": 0.6124, "rewards/accuracies": 1.0, "rewards/chosen": 0.9091209769248962, "rewards/margins": 0.16852426528930664, "rewards/rejected": 0.7405967116355896, "step": 2483 }, { "epoch": 1.34, "learning_rate": 7.753860009536639e-08, "logits/chosen": -2.1239936351776123, "logits/rejected": -2.1317195892333984, "logps/chosen": -4.053481101989746, "logps/rejected": -4.767800331115723, "loss": 0.3885, "rewards/accuracies": 1.0, "rewards/chosen": 1.2639784812927246, "rewards/margins": 0.7449148893356323, "rewards/rejected": 0.5190635919570923, "step": 2484 }, { "epoch": 1.34, "learning_rate": 7.752037038513621e-08, "logits/chosen": -2.0509769916534424, "logits/rejected": -2.053068161010742, "logps/chosen": -1.129826545715332, "logps/rejected": -5.98685884475708, "loss": 0.4049, "rewards/accuracies": 1.0, "rewards/chosen": 1.0329569578170776, "rewards/margins": 0.6949459314346313, "rewards/rejected": 0.3380110263824463, "step": 2485 }, { "epoch": 1.34, "learning_rate": 7.750213542521319e-08, "logits/chosen": -2.1705002784729004, "logits/rejected": -2.176948070526123, "logps/chosen": -3.746570587158203, "logps/rejected": -6.5562744140625, "loss": 0.3667, "rewards/accuracies": 1.0, "rewards/chosen": 1.6658333539962769, "rewards/margins": 0.8142656683921814, "rewards/rejected": 0.8515676856040955, "step": 2486 }, { "epoch": 1.34, "learning_rate": 7.748389521907576e-08, "logits/chosen": -2.0229809284210205, "logits/rejected": -2.268199920654297, "logps/chosen": -0.9419926404953003, "logps/rejected": -0.8856127262115479, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.9885156750679016, "rewards/margins": 0.008759796619415283, "rewards/rejected": 0.9797558784484863, "step": 2487 }, { "epoch": 1.34, "learning_rate": 7.746564977020342e-08, "logits/chosen": -2.108760356903076, "logits/rejected": -2.3753726482391357, "logps/chosen": -10.970169067382812, "logps/rejected": -9.04840087890625, "loss": 0.7413, "rewards/accuracies": 0.0, "rewards/chosen": 0.6608255505561829, "rewards/margins": -0.09400159120559692, "rewards/rejected": 0.7548271417617798, "step": 2488 }, { "epoch": 1.34, "learning_rate": 7.744739908207656e-08, "logits/chosen": -2.1025867462158203, "logits/rejected": -2.171987771987915, "logps/chosen": -15.42514419555664, "logps/rejected": -12.372688293457031, "loss": 0.4943, "rewards/accuracies": 1.0, "rewards/chosen": 1.4585847854614258, "rewards/margins": 0.4473562240600586, "rewards/rejected": 1.0112285614013672, "step": 2489 }, { "epoch": 1.34, "learning_rate": 7.742914315817663e-08, "logits/chosen": -2.017483711242676, "logits/rejected": -2.3017635345458984, "logps/chosen": -0.820893406867981, "logps/rejected": -0.788036584854126, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.9085471034049988, "rewards/margins": 0.042073607444763184, "rewards/rejected": 0.8664734959602356, "step": 2490 }, { "epoch": 1.34, "learning_rate": 7.741088200198607e-08, "logits/chosen": -2.0326614379882812, "logits/rejected": -2.0401697158813477, "logps/chosen": -1.456684947013855, "logps/rejected": -3.443600654602051, "loss": 0.4679, "rewards/accuracies": 1.0, "rewards/chosen": 1.0353795289993286, "rewards/margins": 0.5164042711257935, "rewards/rejected": 0.5189752578735352, "step": 2491 }, { "epoch": 1.34, "learning_rate": 7.73926156169883e-08, "logits/chosen": -2.0356366634368896, "logits/rejected": -2.2333455085754395, "logps/chosen": -1.044406771659851, "logps/rejected": -1.049416184425354, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.785232663154602, "rewards/margins": 0.017520129680633545, "rewards/rejected": 0.7677125334739685, "step": 2492 }, { "epoch": 1.34, "learning_rate": 7.73743440066678e-08, "logits/chosen": -2.073000907897949, "logits/rejected": -2.069600820541382, "logps/chosen": -4.312691688537598, "logps/rejected": -4.285360813140869, "loss": 0.5214, "rewards/accuracies": 1.0, "rewards/chosen": 0.9638744592666626, "rewards/margins": 0.3791871666908264, "rewards/rejected": 0.5846872925758362, "step": 2493 }, { "epoch": 1.35, "learning_rate": 7.735606717450995e-08, "logits/chosen": -2.1089601516723633, "logits/rejected": -2.2843916416168213, "logps/chosen": -3.648483991622925, "logps/rejected": -1.633671760559082, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 1.0796740055084229, "rewards/margins": -0.007810115814208984, "rewards/rejected": 1.0874841213226318, "step": 2494 }, { "epoch": 1.35, "learning_rate": 7.733778512400119e-08, "logits/chosen": -1.9855303764343262, "logits/rejected": -1.9858434200286865, "logps/chosen": -0.42464882135391235, "logps/rejected": -7.5150465965271, "loss": 0.3875, "rewards/accuracies": 1.0, "rewards/chosen": 1.0396279096603394, "rewards/margins": 0.7479842305183411, "rewards/rejected": 0.2916436791419983, "step": 2495 }, { "epoch": 1.35, "learning_rate": 7.731949785862898e-08, "logits/chosen": -2.0595645904541016, "logits/rejected": -2.2472188472747803, "logps/chosen": -0.6138260960578918, "logps/rejected": -0.568439245223999, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 0.8729414343833923, "rewards/margins": 0.03575223684310913, "rewards/rejected": 0.8371891975402832, "step": 2496 }, { "epoch": 1.35, "learning_rate": 7.73012053818817e-08, "logits/chosen": -2.105569362640381, "logits/rejected": -2.281318426132202, "logps/chosen": -0.3804527521133423, "logps/rejected": -0.4120786786079407, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.8418575525283813, "rewards/margins": 0.013976991176605225, "rewards/rejected": 0.8278805613517761, "step": 2497 }, { "epoch": 1.35, "learning_rate": 7.728290769724876e-08, "logits/chosen": -1.9951937198638916, "logits/rejected": -2.00246262550354, "logps/chosen": -2.337663173675537, "logps/rejected": -2.143775701522827, "loss": 0.5253, "rewards/accuracies": 1.0, "rewards/chosen": 1.1681671142578125, "rewards/margins": 0.36959344148635864, "rewards/rejected": 0.7985736727714539, "step": 2498 }, { "epoch": 1.35, "learning_rate": 7.726460480822055e-08, "logits/chosen": -1.9661699533462524, "logits/rejected": -1.9602726697921753, "logps/chosen": -6.46708869934082, "logps/rejected": -3.2249996662139893, "loss": 0.309, "rewards/accuracies": 1.0, "rewards/chosen": 1.5645796060562134, "rewards/margins": 1.015761137008667, "rewards/rejected": 0.5488184094429016, "step": 2499 }, { "epoch": 1.35, "learning_rate": 7.724629671828852e-08, "logits/chosen": -2.0953421592712402, "logits/rejected": -2.2818140983581543, "logps/chosen": -0.7199497818946838, "logps/rejected": -5.488489151000977, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.7356657981872559, "rewards/margins": -0.012005269527435303, "rewards/rejected": 0.7476710677146912, "step": 2500 }, { "epoch": 1.35, "learning_rate": 7.722798343094503e-08, "logits/chosen": -2.145080804824829, "logits/rejected": -2.15073561668396, "logps/chosen": -3.443171739578247, "logps/rejected": -3.440983295440674, "loss": 0.5544, "rewards/accuracies": 1.0, "rewards/chosen": 0.9765308499336243, "rewards/margins": 0.29984402656555176, "rewards/rejected": 0.6766868233680725, "step": 2501 }, { "epoch": 1.35, "learning_rate": 7.720966494968345e-08, "logits/chosen": -1.9479178190231323, "logits/rejected": -1.96035897731781, "logps/chosen": -1.9731018543243408, "logps/rejected": -4.471350193023682, "loss": 0.5028, "rewards/accuracies": 1.0, "rewards/chosen": 0.8347504734992981, "rewards/margins": 0.4255381226539612, "rewards/rejected": 0.4092123508453369, "step": 2502 }, { "epoch": 1.35, "learning_rate": 7.719134127799816e-08, "logits/chosen": -2.0922272205352783, "logits/rejected": -2.0920188426971436, "logps/chosen": -2.1821322441101074, "logps/rejected": -1.6642076969146729, "loss": 0.6006, "rewards/accuracies": 1.0, "rewards/chosen": 1.033899188041687, "rewards/margins": 0.19446319341659546, "rewards/rejected": 0.8394359946250916, "step": 2503 }, { "epoch": 1.35, "learning_rate": 7.717301241938452e-08, "logits/chosen": -2.03684139251709, "logits/rejected": -2.218949556350708, "logps/chosen": -0.34671148657798767, "logps/rejected": -0.3582075834274292, "loss": 0.6725, "rewards/accuracies": 1.0, "rewards/chosen": 0.8208990097045898, "rewards/margins": 0.04174518585205078, "rewards/rejected": 0.7791538238525391, "step": 2504 }, { "epoch": 1.35, "learning_rate": 7.715467837733889e-08, "logits/chosen": -2.065349817276001, "logits/rejected": -2.2420241832733154, "logps/chosen": -0.58946692943573, "logps/rejected": -0.5555073618888855, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.8413864374160767, "rewards/margins": 0.029895782470703125, "rewards/rejected": 0.8114906549453735, "step": 2505 }, { "epoch": 1.35, "learning_rate": 7.713633915535861e-08, "logits/chosen": -2.2352144718170166, "logits/rejected": -2.1239421367645264, "logps/chosen": -28.9212646484375, "logps/rejected": -6.138302326202393, "loss": 0.3242, "rewards/accuracies": 1.0, "rewards/chosen": 1.325877070426941, "rewards/margins": 0.9599676132202148, "rewards/rejected": 0.3659094274044037, "step": 2506 }, { "epoch": 1.35, "learning_rate": 7.7117994756942e-08, "logits/chosen": -2.0630977153778076, "logits/rejected": -2.066196918487549, "logps/chosen": -0.762168288230896, "logps/rejected": -6.512924671173096, "loss": 0.4058, "rewards/accuracies": 1.0, "rewards/chosen": 0.9511686563491821, "rewards/margins": 0.6922169327735901, "rewards/rejected": 0.25895172357559204, "step": 2507 }, { "epoch": 1.35, "learning_rate": 7.709964518558836e-08, "logits/chosen": -2.053370237350464, "logits/rejected": -2.042854070663452, "logps/chosen": -12.128154754638672, "logps/rejected": -4.188307285308838, "loss": 0.4015, "rewards/accuracies": 1.0, "rewards/chosen": 1.141985535621643, "rewards/margins": 0.7049591541290283, "rewards/rejected": 0.43702635169029236, "step": 2508 }, { "epoch": 1.35, "learning_rate": 7.708129044479802e-08, "logits/chosen": -2.170658826828003, "logits/rejected": -2.326558828353882, "logps/chosen": -0.69466632604599, "logps/rejected": -0.6480119824409485, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 1.0622714757919312, "rewards/margins": -0.005805015563964844, "rewards/rejected": 1.068076491355896, "step": 2509 }, { "epoch": 1.35, "learning_rate": 7.706293053807227e-08, "logits/chosen": -2.0228025913238525, "logits/rejected": -2.218801736831665, "logps/chosen": -4.102932929992676, "logps/rejected": -3.739187717437744, "loss": 0.6982, "rewards/accuracies": 0.0, "rewards/chosen": 0.6228684782981873, "rewards/margins": -0.010138988494873047, "rewards/rejected": 0.6330074667930603, "step": 2510 }, { "epoch": 1.35, "learning_rate": 7.704456546891336e-08, "logits/chosen": -2.0535998344421387, "logits/rejected": -2.2548563480377197, "logps/chosen": -0.6148293018341064, "logps/rejected": -0.5803157687187195, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8680291175842285, "rewards/margins": 0.008784651756286621, "rewards/rejected": 0.8592444658279419, "step": 2511 }, { "epoch": 1.35, "learning_rate": 7.702619524082457e-08, "logits/chosen": -2.04695987701416, "logits/rejected": -2.045823335647583, "logps/chosen": -6.259912490844727, "logps/rejected": -6.474740982055664, "loss": 0.3893, "rewards/accuracies": 1.0, "rewards/chosen": 1.1807564496994019, "rewards/margins": 0.7424697875976562, "rewards/rejected": 0.438286691904068, "step": 2512 }, { "epoch": 1.36, "learning_rate": 7.700781985731013e-08, "logits/chosen": -2.0309367179870605, "logits/rejected": -2.234062671661377, "logps/chosen": -5.741998672485352, "logps/rejected": -0.9967154264450073, "loss": 0.8155, "rewards/accuracies": 0.0, "rewards/chosen": 0.6393634676933289, "rewards/margins": -0.23140937089920044, "rewards/rejected": 0.8707728385925293, "step": 2513 }, { "epoch": 1.36, "learning_rate": 7.698943932187527e-08, "logits/chosen": -2.15681791305542, "logits/rejected": -2.1378190517425537, "logps/chosen": -13.15598201751709, "logps/rejected": -1.637357234954834, "loss": 0.4197, "rewards/accuracies": 1.0, "rewards/chosen": 1.5812859535217285, "rewards/margins": 0.6510229706764221, "rewards/rejected": 0.9302629828453064, "step": 2514 }, { "epoch": 1.36, "learning_rate": 7.697105363802618e-08, "logits/chosen": -2.1009738445281982, "logits/rejected": -2.2157340049743652, "logps/chosen": -0.5613344311714172, "logps/rejected": -0.6705487370491028, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.9109379053115845, "rewards/margins": 0.009170353412628174, "rewards/rejected": 0.9017675518989563, "step": 2515 }, { "epoch": 1.36, "learning_rate": 7.695266280927008e-08, "logits/chosen": -1.9762728214263916, "logits/rejected": -1.9675583839416504, "logps/chosen": -10.086807250976562, "logps/rejected": -10.24117660522461, "loss": 0.5312, "rewards/accuracies": 1.0, "rewards/chosen": 1.4413197040557861, "rewards/margins": 0.355316162109375, "rewards/rejected": 1.0860035419464111, "step": 2516 }, { "epoch": 1.36, "learning_rate": 7.693426683911515e-08, "logits/chosen": -2.1060585975646973, "logits/rejected": -2.24983549118042, "logps/chosen": -0.8506674766540527, "logps/rejected": -0.7941759824752808, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.8974620699882507, "rewards/margins": 0.011742174625396729, "rewards/rejected": 0.885719895362854, "step": 2517 }, { "epoch": 1.36, "learning_rate": 7.691586573107051e-08, "logits/chosen": -2.0304479598999023, "logits/rejected": -2.270824432373047, "logps/chosen": -0.634843111038208, "logps/rejected": -0.6138419508934021, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.8482638597488403, "rewards/margins": 0.016083061695098877, "rewards/rejected": 0.8321807980537415, "step": 2518 }, { "epoch": 1.36, "learning_rate": 7.68974594886463e-08, "logits/chosen": -2.0926249027252197, "logits/rejected": -2.1916210651397705, "logps/chosen": -1.2270042896270752, "logps/rejected": -1.4368913173675537, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.8941581845283508, "rewards/margins": 0.0062133073806762695, "rewards/rejected": 0.8879448771476746, "step": 2519 }, { "epoch": 1.36, "learning_rate": 7.687904811535364e-08, "logits/chosen": -2.1270666122436523, "logits/rejected": -2.134106159210205, "logps/chosen": -1.5457268953323364, "logps/rejected": -4.270464897155762, "loss": 0.3798, "rewards/accuracies": 1.0, "rewards/chosen": 1.3439470529556274, "rewards/margins": 0.7722387313842773, "rewards/rejected": 0.5717083215713501, "step": 2520 }, { "epoch": 1.36, "learning_rate": 7.686063161470462e-08, "logits/chosen": -2.101936101913452, "logits/rejected": -2.1125597953796387, "logps/chosen": -1.488630771636963, "logps/rejected": -2.1595277786254883, "loss": 0.5002, "rewards/accuracies": 1.0, "rewards/chosen": 1.19367253780365, "rewards/margins": 0.4323626160621643, "rewards/rejected": 0.7613099217414856, "step": 2521 }, { "epoch": 1.36, "learning_rate": 7.684220999021234e-08, "logits/chosen": -1.9951844215393066, "logits/rejected": -2.005662441253662, "logps/chosen": -1.7446962594985962, "logps/rejected": -2.487626314163208, "loss": 0.4551, "rewards/accuracies": 1.0, "rewards/chosen": 1.230188012123108, "rewards/margins": 0.5511254668235779, "rewards/rejected": 0.67906254529953, "step": 2522 }, { "epoch": 1.36, "learning_rate": 7.682378324539079e-08, "logits/chosen": -2.063239812850952, "logits/rejected": -2.3337745666503906, "logps/chosen": -0.2573902904987335, "logps/rejected": -0.26836445927619934, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.8454868197441101, "rewards/margins": 0.017091453075408936, "rewards/rejected": 0.8283953666687012, "step": 2523 }, { "epoch": 1.36, "learning_rate": 7.680535138375501e-08, "logits/chosen": -2.061642646789551, "logits/rejected": -2.06166410446167, "logps/chosen": -1.490119218826294, "logps/rejected": -1.6361613273620605, "loss": 0.5381, "rewards/accuracies": 1.0, "rewards/chosen": 1.0895891189575195, "rewards/margins": 0.3387240767478943, "rewards/rejected": 0.7508650422096252, "step": 2524 }, { "epoch": 1.36, "learning_rate": 7.678691440882104e-08, "logits/chosen": -2.053253173828125, "logits/rejected": -2.053173065185547, "logps/chosen": -1.7168755531311035, "logps/rejected": -0.6678668260574341, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.9154167175292969, "rewards/margins": 0.024973273277282715, "rewards/rejected": 0.8904434442520142, "step": 2525 }, { "epoch": 1.36, "learning_rate": 7.676847232410581e-08, "logits/chosen": -2.1463489532470703, "logits/rejected": -2.1499037742614746, "logps/chosen": -1.7106997966766357, "logps/rejected": -4.648175239562988, "loss": 0.467, "rewards/accuracies": 1.0, "rewards/chosen": 1.0515586137771606, "rewards/margins": 0.5188032388687134, "rewards/rejected": 0.5327553749084473, "step": 2526 }, { "epoch": 1.36, "learning_rate": 7.675002513312728e-08, "logits/chosen": -2.0785272121429443, "logits/rejected": -2.2590551376342773, "logps/chosen": -1.5243487358093262, "logps/rejected": -1.4059772491455078, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 1.0070708990097046, "rewards/margins": 0.0216292142868042, "rewards/rejected": 0.9854416847229004, "step": 2527 }, { "epoch": 1.36, "learning_rate": 7.673157283940438e-08, "logits/chosen": -2.0468780994415283, "logits/rejected": -2.258009672164917, "logps/chosen": -0.989848792552948, "logps/rejected": -0.7173603177070618, "loss": 0.7028, "rewards/accuracies": 0.0, "rewards/chosen": 0.75026535987854, "rewards/margins": -0.019223570823669434, "rewards/rejected": 0.7694889307022095, "step": 2528 }, { "epoch": 1.36, "learning_rate": 7.671311544645701e-08, "logits/chosen": -1.9907156229019165, "logits/rejected": -2.002533435821533, "logps/chosen": -1.1457737684249878, "logps/rejected": -10.296485900878906, "loss": 0.5742, "rewards/accuracies": 1.0, "rewards/chosen": 0.9954291582107544, "rewards/margins": 0.25403088331222534, "rewards/rejected": 0.741398274898529, "step": 2529 }, { "epoch": 1.36, "learning_rate": 7.669465295780604e-08, "logits/chosen": -2.040966749191284, "logits/rejected": -2.0366671085357666, "logps/chosen": -6.860639572143555, "logps/rejected": -4.820283889770508, "loss": 0.2693, "rewards/accuracies": 1.0, "rewards/chosen": 1.6236841678619385, "rewards/margins": 1.1741554737091064, "rewards/rejected": 0.44952869415283203, "step": 2530 }, { "epoch": 1.37, "learning_rate": 7.667618537697331e-08, "logits/chosen": -2.04311203956604, "logits/rejected": -2.0492441654205322, "logps/chosen": -1.8578686714172363, "logps/rejected": -5.262460708618164, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": 1.1960314512252808, "rewards/margins": 0.6514435410499573, "rewards/rejected": 0.5445879101753235, "step": 2531 }, { "epoch": 1.37, "learning_rate": 7.665771270748162e-08, "logits/chosen": -2.0904271602630615, "logits/rejected": -2.092700481414795, "logps/chosen": -5.843113899230957, "logps/rejected": -9.342291831970215, "loss": 0.2597, "rewards/accuracies": 1.0, "rewards/chosen": 1.6011943817138672, "rewards/margins": 1.2155808210372925, "rewards/rejected": 0.3856135308742523, "step": 2532 }, { "epoch": 1.37, "learning_rate": 7.663923495285478e-08, "logits/chosen": -2.032201051712036, "logits/rejected": -2.3011529445648193, "logps/chosen": -3.9815361499786377, "logps/rejected": -2.522892713546753, "loss": 0.7204, "rewards/accuracies": 0.0, "rewards/chosen": 0.8263663649559021, "rewards/margins": -0.05386686325073242, "rewards/rejected": 0.8802332282066345, "step": 2533 }, { "epoch": 1.37, "learning_rate": 7.662075211661753e-08, "logits/chosen": -2.0489933490753174, "logits/rejected": -2.313525438308716, "logps/chosen": -0.8603541851043701, "logps/rejected": -0.9196917414665222, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 1.0668238401412964, "rewards/margins": 0.003724813461303711, "rewards/rejected": 1.0630990266799927, "step": 2534 }, { "epoch": 1.37, "learning_rate": 7.660226420229559e-08, "logits/chosen": -2.1306915283203125, "logits/rejected": -2.139127254486084, "logps/chosen": -1.500752568244934, "logps/rejected": -4.319703102111816, "loss": 0.4364, "rewards/accuracies": 1.0, "rewards/chosen": 1.0834507942199707, "rewards/margins": 0.6029545664787292, "rewards/rejected": 0.48049622774124146, "step": 2535 }, { "epoch": 1.37, "learning_rate": 7.658377121341564e-08, "logits/chosen": -2.110368490219116, "logits/rejected": -2.1078367233276367, "logps/chosen": -6.425419807434082, "logps/rejected": -4.074185848236084, "loss": 0.4627, "rewards/accuracies": 1.0, "rewards/chosen": 1.5371135473251343, "rewards/margins": 0.5305027961730957, "rewards/rejected": 1.0066107511520386, "step": 2536 }, { "epoch": 1.37, "learning_rate": 7.656527315350535e-08, "logits/chosen": -2.039461612701416, "logits/rejected": -2.2673416137695312, "logps/chosen": -0.4083077311515808, "logps/rejected": -0.41493216156959534, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 0.7028733491897583, "rewards/margins": 0.03483551740646362, "rewards/rejected": 0.6680378317832947, "step": 2537 }, { "epoch": 1.37, "learning_rate": 7.654677002609338e-08, "logits/chosen": -2.0876381397247314, "logits/rejected": -2.094541072845459, "logps/chosen": -9.124987602233887, "logps/rejected": -2.672027826309204, "loss": 0.3489, "rewards/accuracies": 1.0, "rewards/chosen": 1.486605167388916, "rewards/margins": 0.8734255433082581, "rewards/rejected": 0.613179624080658, "step": 2538 }, { "epoch": 1.37, "learning_rate": 7.652826183470928e-08, "logits/chosen": -2.0323290824890137, "logits/rejected": -2.2373571395874023, "logps/chosen": -3.186049222946167, "logps/rejected": -2.4729816913604736, "loss": 0.7146, "rewards/accuracies": 0.0, "rewards/chosen": 0.7883286476135254, "rewards/margins": -0.04247397184371948, "rewards/rejected": 0.8308026194572449, "step": 2539 }, { "epoch": 1.37, "learning_rate": 7.650974858288363e-08, "logits/chosen": -2.040942430496216, "logits/rejected": -2.0324223041534424, "logps/chosen": -4.912188529968262, "logps/rejected": -4.133311748504639, "loss": 0.283, "rewards/accuracies": 1.0, "rewards/chosen": 1.5743142366409302, "rewards/margins": 1.1176083087921143, "rewards/rejected": 0.45670589804649353, "step": 2540 }, { "epoch": 1.37, "learning_rate": 7.649123027414796e-08, "logits/chosen": -2.1719810962677, "logits/rejected": -2.0911452770233154, "logps/chosen": -35.56239318847656, "logps/rejected": -4.788232326507568, "loss": 0.2392, "rewards/accuracies": 1.0, "rewards/chosen": 1.7229149341583252, "rewards/margins": 1.3084678649902344, "rewards/rejected": 0.4144470691680908, "step": 2541 }, { "epoch": 1.37, "learning_rate": 7.647270691203475e-08, "logits/chosen": -2.1055076122283936, "logits/rejected": -1.9992506504058838, "logps/chosen": -38.84588623046875, "logps/rejected": -4.242805480957031, "loss": 0.1867, "rewards/accuracies": 1.0, "rewards/chosen": 2.2236526012420654, "rewards/margins": 1.5832927227020264, "rewards/rejected": 0.6403598785400391, "step": 2542 }, { "epoch": 1.37, "learning_rate": 7.645417850007745e-08, "logits/chosen": -2.2038650512695312, "logits/rejected": -2.254971981048584, "logps/chosen": -6.124395370483398, "logps/rejected": -18.35414695739746, "loss": 0.3195, "rewards/accuracies": 1.0, "rewards/chosen": 1.3586825132369995, "rewards/margins": 0.9768677353858948, "rewards/rejected": 0.38181477785110474, "step": 2543 }, { "epoch": 1.37, "learning_rate": 7.643564504181047e-08, "logits/chosen": -1.950921893119812, "logits/rejected": -1.960873007774353, "logps/chosen": -1.8161276578903198, "logps/rejected": -7.49299430847168, "loss": 0.4255, "rewards/accuracies": 1.0, "rewards/chosen": 1.1789259910583496, "rewards/margins": 0.6343428492546082, "rewards/rejected": 0.5445831418037415, "step": 2544 }, { "epoch": 1.37, "learning_rate": 7.641710654076922e-08, "logits/chosen": -2.01847243309021, "logits/rejected": -2.0186808109283447, "logps/chosen": -2.0142250061035156, "logps/rejected": -5.393263339996338, "loss": 0.3339, "rewards/accuracies": 1.0, "rewards/chosen": 1.504709243774414, "rewards/margins": 0.9252529740333557, "rewards/rejected": 0.5794562697410583, "step": 2545 }, { "epoch": 1.37, "learning_rate": 7.639856300049002e-08, "logits/chosen": -1.953956127166748, "logits/rejected": -1.9439719915390015, "logps/chosen": -0.8038508892059326, "logps/rejected": -4.316073417663574, "loss": 0.4915, "rewards/accuracies": 1.0, "rewards/chosen": 1.1418298482894897, "rewards/margins": 0.454451322555542, "rewards/rejected": 0.6873785257339478, "step": 2546 }, { "epoch": 1.37, "learning_rate": 7.638001442451019e-08, "logits/chosen": -2.0911178588867188, "logits/rejected": -2.279618978500366, "logps/chosen": -0.6842411756515503, "logps/rejected": -0.5818505883216858, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 1.0118381977081299, "rewards/margins": 0.028604567050933838, "rewards/rejected": 0.983233630657196, "step": 2547 }, { "epoch": 1.37, "learning_rate": 7.636146081636796e-08, "logits/chosen": -2.134891986846924, "logits/rejected": -2.130326986312866, "logps/chosen": -2.9213674068450928, "logps/rejected": -4.598830223083496, "loss": 0.4948, "rewards/accuracies": 1.0, "rewards/chosen": 1.0472488403320312, "rewards/margins": 0.44596707820892334, "rewards/rejected": 0.6012817621231079, "step": 2548 }, { "epoch": 1.37, "learning_rate": 7.634290217960258e-08, "logits/chosen": -2.0862433910369873, "logits/rejected": -2.2460951805114746, "logps/chosen": -0.5573434829711914, "logps/rejected": -7.521070957183838, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 0.8663583993911743, "rewards/margins": 0.14812636375427246, "rewards/rejected": 0.7182320356369019, "step": 2549 }, { "epoch": 1.38, "learning_rate": 7.632433851775422e-08, "logits/chosen": -1.9871114492416382, "logits/rejected": -1.9230875968933105, "logps/chosen": -16.918109893798828, "logps/rejected": -1.5162084102630615, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": 1.6675548553466797, "rewards/margins": 0.7616305947303772, "rewards/rejected": 0.9059242606163025, "step": 2550 }, { "epoch": 1.38, "learning_rate": 7.630576983436404e-08, "logits/chosen": -2.076282024383545, "logits/rejected": -2.0676958560943604, "logps/chosen": -7.500617504119873, "logps/rejected": -6.459981918334961, "loss": 0.2679, "rewards/accuracies": 1.0, "rewards/chosen": 1.6226938962936401, "rewards/margins": 1.1800577640533447, "rewards/rejected": 0.442636102437973, "step": 2551 }, { "epoch": 1.38, "learning_rate": 7.628719613297412e-08, "logits/chosen": -1.9964643716812134, "logits/rejected": -2.2206106185913086, "logps/chosen": -0.9362214803695679, "logps/rejected": -0.9587066173553467, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.864477813243866, "rewards/margins": 0.0034752488136291504, "rewards/rejected": 0.8610025644302368, "step": 2552 }, { "epoch": 1.38, "learning_rate": 7.626861741712755e-08, "logits/chosen": -1.976702332496643, "logits/rejected": -1.9742801189422607, "logps/chosen": -5.694979667663574, "logps/rejected": -2.862668037414551, "loss": 0.3027, "rewards/accuracies": 1.0, "rewards/chosen": 1.6783764362335205, "rewards/margins": 1.0399820804595947, "rewards/rejected": 0.6383943557739258, "step": 2553 }, { "epoch": 1.38, "learning_rate": 7.625003369036831e-08, "logits/chosen": -2.0033905506134033, "logits/rejected": -2.2922346591949463, "logps/chosen": -0.6207816004753113, "logps/rejected": -0.6658748388290405, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.8573893904685974, "rewards/margins": 0.024314165115356445, "rewards/rejected": 0.833075225353241, "step": 2554 }, { "epoch": 1.38, "learning_rate": 7.623144495624135e-08, "logits/chosen": -2.0418760776519775, "logits/rejected": -2.0431222915649414, "logps/chosen": -1.7226601839065552, "logps/rejected": -0.9900423884391785, "loss": 0.6152, "rewards/accuracies": 1.0, "rewards/chosen": 0.9513433575630188, "rewards/margins": 0.16238081455230713, "rewards/rejected": 0.7889625430107117, "step": 2555 }, { "epoch": 1.38, "learning_rate": 7.621285121829264e-08, "logits/chosen": -2.0850346088409424, "logits/rejected": -2.0051138401031494, "logps/chosen": -42.164058685302734, "logps/rejected": -4.848705768585205, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 1.6274410486221313, "rewards/margins": 1.3777437210083008, "rewards/rejected": 0.24969731271266937, "step": 2556 }, { "epoch": 1.38, "learning_rate": 7.619425248006903e-08, "logits/chosen": -2.0550177097320557, "logits/rejected": -2.3017899990081787, "logps/chosen": -0.32673364877700806, "logps/rejected": -0.34967440366744995, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.7571606040000916, "rewards/margins": 0.00595468282699585, "rewards/rejected": 0.7512059211730957, "step": 2557 }, { "epoch": 1.38, "learning_rate": 7.617564874511837e-08, "logits/chosen": -2.02347731590271, "logits/rejected": -2.263181209564209, "logps/chosen": -1.065014123916626, "logps/rejected": -1.0405255556106567, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9551091194152832, "rewards/margins": 0.013501465320587158, "rewards/rejected": 0.941607654094696, "step": 2558 }, { "epoch": 1.38, "learning_rate": 7.615704001698944e-08, "logits/chosen": -1.9638093709945679, "logits/rejected": -2.2636213302612305, "logps/chosen": -2.718755006790161, "logps/rejected": -2.8731038570404053, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.552396297454834, "rewards/margins": 0.019410312175750732, "rewards/rejected": 0.5329859852790833, "step": 2559 }, { "epoch": 1.38, "learning_rate": 7.613842629923198e-08, "logits/chosen": -2.1170105934143066, "logits/rejected": -2.120025396347046, "logps/chosen": -1.883105754852295, "logps/rejected": -5.1989593505859375, "loss": 0.3868, "rewards/accuracies": 1.0, "rewards/chosen": 1.180140733718872, "rewards/margins": 0.7501479387283325, "rewards/rejected": 0.42999276518821716, "step": 2560 }, { "epoch": 1.38, "learning_rate": 7.611980759539666e-08, "logits/chosen": -2.1283023357391357, "logits/rejected": -2.31431245803833, "logps/chosen": -2.4999067783355713, "logps/rejected": -2.1238133907318115, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.8077608346939087, "rewards/margins": -0.012316286563873291, "rewards/rejected": 0.820077121257782, "step": 2561 }, { "epoch": 1.38, "learning_rate": 7.610118390903515e-08, "logits/chosen": -2.0248658657073975, "logits/rejected": -2.251272678375244, "logps/chosen": -1.3181850910186768, "logps/rejected": -1.4899518489837646, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.7502312660217285, "rewards/margins": 0.026862025260925293, "rewards/rejected": 0.7233692407608032, "step": 2562 }, { "epoch": 1.38, "learning_rate": 7.608255524370002e-08, "logits/chosen": -2.1051363945007324, "logits/rejected": -2.102851629257202, "logps/chosen": -8.747482299804688, "logps/rejected": -3.656956672668457, "loss": 0.4403, "rewards/accuracies": 1.0, "rewards/chosen": 1.1505367755889893, "rewards/margins": 0.592099130153656, "rewards/rejected": 0.5584376454353333, "step": 2563 }, { "epoch": 1.38, "learning_rate": 7.606392160294482e-08, "logits/chosen": -2.0315113067626953, "logits/rejected": -2.2593395709991455, "logps/chosen": -2.6713483333587646, "logps/rejected": -2.8073818683624268, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.8351207971572876, "rewards/margins": 0.03826409578323364, "rewards/rejected": 0.796856701374054, "step": 2564 }, { "epoch": 1.38, "learning_rate": 7.604528299032404e-08, "logits/chosen": -2.061436176300049, "logits/rejected": -2.2900500297546387, "logps/chosen": -2.0752177238464355, "logps/rejected": -2.2600488662719727, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.9079328775405884, "rewards/margins": 0.02632683515548706, "rewards/rejected": 0.8816060423851013, "step": 2565 }, { "epoch": 1.38, "learning_rate": 7.60266394093931e-08, "logits/chosen": -2.0703511238098145, "logits/rejected": -2.215498924255371, "logps/chosen": -0.21601323783397675, "logps/rejected": -0.17487657070159912, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.6662006378173828, "rewards/margins": 0.02181875705718994, "rewards/rejected": 0.6443818807601929, "step": 2566 }, { "epoch": 1.38, "learning_rate": 7.600799086370841e-08, "logits/chosen": -2.035393238067627, "logits/rejected": -2.038437843322754, "logps/chosen": -2.6494030952453613, "logps/rejected": -3.373563289642334, "loss": 0.369, "rewards/accuracies": 1.0, "rewards/chosen": 1.2206496000289917, "rewards/margins": 0.8066757917404175, "rewards/rejected": 0.41397377848625183, "step": 2567 }, { "epoch": 1.39, "learning_rate": 7.598933735682733e-08, "logits/chosen": -2.055138349533081, "logits/rejected": -2.300177574157715, "logps/chosen": -0.5837457776069641, "logps/rejected": -0.4957961440086365, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.9524874091148376, "rewards/margins": 0.03247880935668945, "rewards/rejected": 0.9200085997581482, "step": 2568 }, { "epoch": 1.39, "learning_rate": 7.59706788923081e-08, "logits/chosen": -2.1302947998046875, "logits/rejected": -2.1394314765930176, "logps/chosen": -1.9331271648406982, "logps/rejected": -2.650385618209839, "loss": 0.5346, "rewards/accuracies": 1.0, "rewards/chosen": 0.986015260219574, "rewards/margins": 0.3469635248184204, "rewards/rejected": 0.6390517354011536, "step": 2569 }, { "epoch": 1.39, "learning_rate": 7.595201547370993e-08, "logits/chosen": -1.9406992197036743, "logits/rejected": -1.937798023223877, "logps/chosen": -3.331042766571045, "logps/rejected": -6.18599271774292, "loss": 0.5086, "rewards/accuracies": 1.0, "rewards/chosen": 0.7966125011444092, "rewards/margins": 0.41097477078437805, "rewards/rejected": 0.38563773036003113, "step": 2570 }, { "epoch": 1.39, "learning_rate": 7.593334710459303e-08, "logits/chosen": -1.9663875102996826, "logits/rejected": -1.9708598852157593, "logps/chosen": -2.609651803970337, "logps/rejected": -4.6002583503723145, "loss": 0.3534, "rewards/accuracies": 1.0, "rewards/chosen": 1.3774055242538452, "rewards/margins": 0.8581206202507019, "rewards/rejected": 0.5192849040031433, "step": 2571 }, { "epoch": 1.39, "learning_rate": 7.591467378851848e-08, "logits/chosen": -2.0533745288848877, "logits/rejected": -2.0604946613311768, "logps/chosen": -4.2192583084106445, "logps/rejected": -6.119122505187988, "loss": 0.4013, "rewards/accuracies": 1.0, "rewards/chosen": 1.5735691785812378, "rewards/margins": 0.705610454082489, "rewards/rejected": 0.8679587244987488, "step": 2572 }, { "epoch": 1.39, "learning_rate": 7.589599552904838e-08, "logits/chosen": -1.9729580879211426, "logits/rejected": -1.972678542137146, "logps/chosen": -3.6776721477508545, "logps/rejected": -3.7562925815582275, "loss": 0.257, "rewards/accuracies": 1.0, "rewards/chosen": 1.738932490348816, "rewards/margins": 1.2273424863815308, "rewards/rejected": 0.5115900039672852, "step": 2573 }, { "epoch": 1.39, "learning_rate": 7.58773123297457e-08, "logits/chosen": -2.1341679096221924, "logits/rejected": -2.1342978477478027, "logps/chosen": -2.00760817527771, "logps/rejected": -3.840841293334961, "loss": 0.2798, "rewards/accuracies": 1.0, "rewards/chosen": 1.5856837034225464, "rewards/margins": 1.13052499294281, "rewards/rejected": 0.45515871047973633, "step": 2574 }, { "epoch": 1.39, "learning_rate": 7.585862419417439e-08, "logits/chosen": -1.996760606765747, "logits/rejected": -2.2977187633514404, "logps/chosen": -2.4233219623565674, "logps/rejected": -2.5336527824401855, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 1.2294639348983765, "rewards/margins": 0.03295087814331055, "rewards/rejected": 1.196513056755066, "step": 2575 }, { "epoch": 1.39, "learning_rate": 7.583993112589935e-08, "logits/chosen": -2.007535696029663, "logits/rejected": -2.268620729446411, "logps/chosen": -0.3074585199356079, "logps/rejected": -0.338626503944397, "loss": 0.6739, "rewards/accuracies": 1.0, "rewards/chosen": 0.9826361536979675, "rewards/margins": 0.03890770673751831, "rewards/rejected": 0.9437284469604492, "step": 2576 }, { "epoch": 1.39, "learning_rate": 7.58212331284864e-08, "logits/chosen": -2.1024439334869385, "logits/rejected": -2.10575008392334, "logps/chosen": -2.847075939178467, "logps/rejected": -3.8944108486175537, "loss": 0.4333, "rewards/accuracies": 1.0, "rewards/chosen": 1.0900477170944214, "rewards/margins": 0.6119928359985352, "rewards/rejected": 0.4780549108982086, "step": 2577 }, { "epoch": 1.39, "learning_rate": 7.580253020550229e-08, "logits/chosen": -2.01322865486145, "logits/rejected": -2.0122838020324707, "logps/chosen": -0.5661492347717285, "logps/rejected": -3.0979292392730713, "loss": 0.5102, "rewards/accuracies": 1.0, "rewards/chosen": 1.0368196964263916, "rewards/margins": 0.406973659992218, "rewards/rejected": 0.6298460364341736, "step": 2578 }, { "epoch": 1.39, "learning_rate": 7.578382236051473e-08, "logits/chosen": -2.014241933822632, "logits/rejected": -2.0274784564971924, "logps/chosen": -1.5566844940185547, "logps/rejected": -7.462024688720703, "loss": 0.4183, "rewards/accuracies": 1.0, "rewards/chosen": 1.119760513305664, "rewards/margins": 0.6550873517990112, "rewards/rejected": 0.46467313170433044, "step": 2579 }, { "epoch": 1.39, "learning_rate": 7.576510959709239e-08, "logits/chosen": -2.0167648792266846, "logits/rejected": -2.22015643119812, "logps/chosen": -5.662900447845459, "logps/rejected": -1.430558204650879, "loss": 0.7395, "rewards/accuracies": 0.0, "rewards/chosen": 0.7956905961036682, "rewards/margins": -0.09057193994522095, "rewards/rejected": 0.8862625360488892, "step": 2580 }, { "epoch": 1.39, "learning_rate": 7.574639191880483e-08, "logits/chosen": -2.119469165802002, "logits/rejected": -1.9404345750808716, "logps/chosen": -37.90137481689453, "logps/rejected": -3.2909326553344727, "loss": 0.2408, "rewards/accuracies": 1.0, "rewards/chosen": 1.9330066442489624, "rewards/margins": 1.3008038997650146, "rewards/rejected": 0.6322027444839478, "step": 2581 }, { "epoch": 1.39, "learning_rate": 7.572766932922259e-08, "logits/chosen": -2.2204179763793945, "logits/rejected": -2.2143540382385254, "logps/chosen": -3.8750216960906982, "logps/rejected": -1.8223769664764404, "loss": 0.5256, "rewards/accuracies": 1.0, "rewards/chosen": 1.1012375354766846, "rewards/margins": 0.368897020816803, "rewards/rejected": 0.7323405146598816, "step": 2582 }, { "epoch": 1.39, "learning_rate": 7.57089418319171e-08, "logits/chosen": -2.266308069229126, "logits/rejected": -2.1811017990112305, "logps/chosen": -42.271339416503906, "logps/rejected": -6.054525852203369, "loss": 0.2438, "rewards/accuracies": 1.0, "rewards/chosen": 2.0478463172912598, "rewards/margins": 1.2870416641235352, "rewards/rejected": 0.7608047127723694, "step": 2583 }, { "epoch": 1.39, "learning_rate": 7.569020943046078e-08, "logits/chosen": -2.084104537963867, "logits/rejected": -2.279695749282837, "logps/chosen": -1.0054188966751099, "logps/rejected": -1.121710181236267, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 0.732266366481781, "rewards/margins": 0.03590047359466553, "rewards/rejected": 0.6963658928871155, "step": 2584 }, { "epoch": 1.39, "learning_rate": 7.567147212842694e-08, "logits/chosen": -2.0284132957458496, "logits/rejected": -2.0342044830322266, "logps/chosen": -4.17999267578125, "logps/rejected": -5.374990463256836, "loss": 0.4338, "rewards/accuracies": 1.0, "rewards/chosen": 0.96551513671875, "rewards/margins": 0.6104798316955566, "rewards/rejected": 0.35503530502319336, "step": 2585 }, { "epoch": 1.39, "learning_rate": 7.565272992938987e-08, "logits/chosen": -2.145843744277954, "logits/rejected": -2.198807954788208, "logps/chosen": -5.763772487640381, "logps/rejected": -8.54773998260498, "loss": 0.5163, "rewards/accuracies": 1.0, "rewards/chosen": 1.2514610290527344, "rewards/margins": 0.39175063371658325, "rewards/rejected": 0.8597103953361511, "step": 2586 }, { "epoch": 1.4, "learning_rate": 7.563398283692475e-08, "logits/chosen": -2.002908706665039, "logits/rejected": -2.197983741760254, "logps/chosen": -1.721278429031372, "logps/rejected": -1.7593873739242554, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9644597172737122, "rewards/margins": 0.01879197359085083, "rewards/rejected": 0.9456677436828613, "step": 2587 }, { "epoch": 1.4, "learning_rate": 7.56152308546077e-08, "logits/chosen": -2.068554162979126, "logits/rejected": -2.0741147994995117, "logps/chosen": -2.639909267425537, "logps/rejected": -5.462691783905029, "loss": 0.448, "rewards/accuracies": 1.0, "rewards/chosen": 1.0270490646362305, "rewards/margins": 0.5707173347473145, "rewards/rejected": 0.45633170008659363, "step": 2588 }, { "epoch": 1.4, "learning_rate": 7.559647398601581e-08, "logits/chosen": -2.0139715671539307, "logits/rejected": -2.330933094024658, "logps/chosen": -0.6760424971580505, "logps/rejected": -0.7030742764472961, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.945530354976654, "rewards/margins": 0.014687001705169678, "rewards/rejected": 0.9308433532714844, "step": 2589 }, { "epoch": 1.4, "learning_rate": 7.557771223472707e-08, "logits/chosen": -2.151394844055176, "logits/rejected": -2.152724027633667, "logps/chosen": -3.960787773132324, "logps/rejected": -3.4093146324157715, "loss": 0.5036, "rewards/accuracies": 1.0, "rewards/chosen": 1.0394748449325562, "rewards/margins": 0.42360228300094604, "rewards/rejected": 0.6158725619316101, "step": 2590 }, { "epoch": 1.4, "learning_rate": 7.555894560432041e-08, "logits/chosen": -2.1039018630981445, "logits/rejected": -2.225595235824585, "logps/chosen": -2.411606788635254, "logps/rejected": -3.6244170665740967, "loss": 0.5977, "rewards/accuracies": 1.0, "rewards/chosen": 0.9301800727844238, "rewards/margins": 0.20102328062057495, "rewards/rejected": 0.7291567921638489, "step": 2591 }, { "epoch": 1.4, "learning_rate": 7.554017409837567e-08, "logits/chosen": -2.1984167098999023, "logits/rejected": -2.204796075820923, "logps/chosen": -1.424438714981079, "logps/rejected": -2.9273734092712402, "loss": 0.4452, "rewards/accuracies": 1.0, "rewards/chosen": 1.1365966796875, "rewards/margins": 0.5782912373542786, "rewards/rejected": 0.5583054423332214, "step": 2592 }, { "epoch": 1.4, "learning_rate": 7.552139772047368e-08, "logits/chosen": -2.112820863723755, "logits/rejected": -2.3070807456970215, "logps/chosen": -2.264155387878418, "logps/rejected": -2.284364700317383, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.6495513916015625, "rewards/margins": 0.0037410855293273926, "rewards/rejected": 0.6458103060722351, "step": 2593 }, { "epoch": 1.4, "learning_rate": 7.550261647419609e-08, "logits/chosen": -2.0533251762390137, "logits/rejected": -2.253143072128296, "logps/chosen": -0.3047564625740051, "logps/rejected": -0.40348029136657715, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.8531033396720886, "rewards/margins": 0.007837831974029541, "rewards/rejected": 0.8452655076980591, "step": 2594 }, { "epoch": 1.4, "learning_rate": 7.548383036312561e-08, "logits/chosen": -2.2498602867126465, "logits/rejected": -2.149899959564209, "logps/chosen": -28.15613555908203, "logps/rejected": -4.043148517608643, "loss": 0.2393, "rewards/accuracies": 1.0, "rewards/chosen": 1.766976237297058, "rewards/margins": 1.3080435991287231, "rewards/rejected": 0.45893263816833496, "step": 2595 }, { "epoch": 1.4, "learning_rate": 7.546503939084578e-08, "logits/chosen": -2.0512053966522217, "logits/rejected": -2.2809033393859863, "logps/chosen": -4.201961517333984, "logps/rejected": -0.976357102394104, "loss": 0.8359, "rewards/accuracies": 0.0, "rewards/chosen": 0.8269401788711548, "rewards/margins": -0.26766037940979004, "rewards/rejected": 1.0946005582809448, "step": 2596 }, { "epoch": 1.4, "learning_rate": 7.544624356094113e-08, "logits/chosen": -2.037879705429077, "logits/rejected": -2.025691032409668, "logps/chosen": -2.160773515701294, "logps/rejected": -5.342173099517822, "loss": 0.4539, "rewards/accuracies": 1.0, "rewards/chosen": 1.3838392496109009, "rewards/margins": 0.5543197989463806, "rewards/rejected": 0.8295194506645203, "step": 2597 }, { "epoch": 1.4, "learning_rate": 7.542744287699709e-08, "logits/chosen": -1.9716638326644897, "logits/rejected": -2.2379541397094727, "logps/chosen": -0.2304067760705948, "logps/rejected": -0.25999361276626587, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.8382502794265747, "rewards/margins": 0.020580589771270752, "rewards/rejected": 0.817669689655304, "step": 2598 }, { "epoch": 1.4, "learning_rate": 7.540863734259998e-08, "logits/chosen": -1.9959741830825806, "logits/rejected": -2.2669708728790283, "logps/chosen": -0.4692404866218567, "logps/rejected": -0.48264288902282715, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.9714867472648621, "rewards/margins": 0.015995681285858154, "rewards/rejected": 0.9554910659790039, "step": 2599 }, { "epoch": 1.4, "learning_rate": 7.53898269613371e-08, "logits/chosen": -2.0597312450408936, "logits/rejected": -2.288811445236206, "logps/chosen": -0.9626386761665344, "logps/rejected": -1.1060123443603516, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.7351673245429993, "rewards/margins": 0.002190887928009033, "rewards/rejected": 0.7329764366149902, "step": 2600 }, { "epoch": 1.4, "learning_rate": 7.537101173679666e-08, "logits/chosen": -2.1476240158081055, "logits/rejected": -2.142191171646118, "logps/chosen": -2.4863808155059814, "logps/rejected": -2.8600234985351562, "loss": 0.527, "rewards/accuracies": 1.0, "rewards/chosen": 1.1385834217071533, "rewards/margins": 0.3656013011932373, "rewards/rejected": 0.772982120513916, "step": 2601 }, { "epoch": 1.4, "learning_rate": 7.535219167256776e-08, "logits/chosen": -2.0711820125579834, "logits/rejected": -2.0957117080688477, "logps/chosen": -3.205254077911377, "logps/rejected": -6.088403701782227, "loss": 0.4776, "rewards/accuracies": 1.0, "rewards/chosen": 1.1738232374191284, "rewards/margins": 0.4907761216163635, "rewards/rejected": 0.6830471158027649, "step": 2602 }, { "epoch": 1.4, "learning_rate": 7.533336677224051e-08, "logits/chosen": -2.032853364944458, "logits/rejected": -2.011866569519043, "logps/chosen": -7.208702087402344, "logps/rejected": -6.333214282989502, "loss": 0.3629, "rewards/accuracies": 1.0, "rewards/chosen": 1.3137091398239136, "rewards/margins": 0.8268475532531738, "rewards/rejected": 0.48686155676841736, "step": 2603 }, { "epoch": 1.4, "learning_rate": 7.531453703940584e-08, "logits/chosen": -2.149723768234253, "logits/rejected": -2.2916202545166016, "logps/chosen": -0.49415919184684753, "logps/rejected": -0.4929582476615906, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 1.0533133745193481, "rewards/margins": -0.0003635883331298828, "rewards/rejected": 1.053676962852478, "step": 2604 }, { "epoch": 1.41, "learning_rate": 7.529570247765564e-08, "logits/chosen": -2.129082202911377, "logits/rejected": -2.1313040256500244, "logps/chosen": -0.9006062746047974, "logps/rejected": -2.826646327972412, "loss": 0.5464, "rewards/accuracies": 1.0, "rewards/chosen": 1.0681464672088623, "rewards/margins": 0.31868237257003784, "rewards/rejected": 0.7494640946388245, "step": 2605 }, { "epoch": 1.41, "learning_rate": 7.527686309058275e-08, "logits/chosen": -2.050577163696289, "logits/rejected": -2.2915937900543213, "logps/chosen": -1.3452472686767578, "logps/rejected": -1.309403657913208, "loss": 0.6732, "rewards/accuracies": 1.0, "rewards/chosen": 0.972163200378418, "rewards/margins": 0.04035043716430664, "rewards/rejected": 0.9318127632141113, "step": 2606 }, { "epoch": 1.41, "learning_rate": 7.525801888178091e-08, "logits/chosen": -2.201251983642578, "logits/rejected": -2.294191837310791, "logps/chosen": -13.437718391418457, "logps/rejected": -8.873114585876465, "loss": 0.799, "rewards/accuracies": 0.0, "rewards/chosen": 0.6895982027053833, "rewards/margins": -0.20164906978607178, "rewards/rejected": 0.8912472724914551, "step": 2607 }, { "epoch": 1.41, "learning_rate": 7.523916985484476e-08, "logits/chosen": -1.9670134782791138, "logits/rejected": -2.235588788986206, "logps/chosen": -1.5217840671539307, "logps/rejected": -3.8443288803100586, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 1.0189136266708374, "rewards/margins": 0.03952634334564209, "rewards/rejected": 0.9793872833251953, "step": 2608 }, { "epoch": 1.41, "learning_rate": 7.522031601336989e-08, "logits/chosen": -2.079580068588257, "logits/rejected": -2.1789278984069824, "logps/chosen": -1.4533902406692505, "logps/rejected": -27.365684509277344, "loss": 0.2495, "rewards/accuracies": 1.0, "rewards/chosen": 1.1277164220809937, "rewards/margins": 1.2609609365463257, "rewards/rejected": -0.13324451446533203, "step": 2609 }, { "epoch": 1.41, "learning_rate": 7.520145736095278e-08, "logits/chosen": -2.0392496585845947, "logits/rejected": -2.288484573364258, "logps/chosen": -0.2288024127483368, "logps/rejected": -0.2277076542377472, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.9469013214111328, "rewards/margins": 0.009526729583740234, "rewards/rejected": 0.9373745918273926, "step": 2610 }, { "epoch": 1.41, "learning_rate": 7.518259390119086e-08, "logits/chosen": -2.155388832092285, "logits/rejected": -2.3286261558532715, "logps/chosen": -0.8025959134101868, "logps/rejected": -3.2150893211364746, "loss": 0.6717, "rewards/accuracies": 1.0, "rewards/chosen": 1.0255374908447266, "rewards/margins": 0.043419063091278076, "rewards/rejected": 0.9821184277534485, "step": 2611 }, { "epoch": 1.41, "learning_rate": 7.516372563768243e-08, "logits/chosen": -2.1525115966796875, "logits/rejected": -2.09551739692688, "logps/chosen": -22.334009170532227, "logps/rejected": -3.303316593170166, "loss": 0.2202, "rewards/accuracies": 1.0, "rewards/chosen": 2.16818904876709, "rewards/margins": 1.4013524055480957, "rewards/rejected": 0.7668367028236389, "step": 2612 }, { "epoch": 1.41, "learning_rate": 7.514485257402679e-08, "logits/chosen": -2.0214810371398926, "logits/rejected": -2.271571159362793, "logps/chosen": -1.9700287580490112, "logps/rejected": -4.66929817199707, "loss": 0.5972, "rewards/accuracies": 1.0, "rewards/chosen": 0.8763251304626465, "rewards/margins": 0.20210236310958862, "rewards/rejected": 0.6742227673530579, "step": 2613 }, { "epoch": 1.41, "learning_rate": 7.512597471382406e-08, "logits/chosen": -2.1068115234375, "logits/rejected": -2.217094659805298, "logps/chosen": -8.93786334991455, "logps/rejected": -21.693376541137695, "loss": 0.2463, "rewards/accuracies": 1.0, "rewards/chosen": 1.8951473236083984, "rewards/margins": 1.275439977645874, "rewards/rejected": 0.6197072863578796, "step": 2614 }, { "epoch": 1.41, "learning_rate": 7.510709206067534e-08, "logits/chosen": -1.9771987199783325, "logits/rejected": -1.9764823913574219, "logps/chosen": -9.425329208374023, "logps/rejected": -1.3615343570709229, "loss": 0.4796, "rewards/accuracies": 1.0, "rewards/chosen": 1.3525971174240112, "rewards/margins": 0.48555392026901245, "rewards/rejected": 0.8670431971549988, "step": 2615 }, { "epoch": 1.41, "learning_rate": 7.50882046181826e-08, "logits/chosen": -2.0530519485473633, "logits/rejected": -2.2705647945404053, "logps/chosen": -1.10016930103302, "logps/rejected": -1.247501015663147, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 1.0685981512069702, "rewards/margins": 0.02803945541381836, "rewards/rejected": 1.0405586957931519, "step": 2616 }, { "epoch": 1.41, "learning_rate": 7.506931238994873e-08, "logits/chosen": -2.0684733390808105, "logits/rejected": -2.07568097114563, "logps/chosen": -5.381629943847656, "logps/rejected": -3.058232307434082, "loss": 0.5183, "rewards/accuracies": 1.0, "rewards/chosen": 1.0176106691360474, "rewards/margins": 0.3868131637573242, "rewards/rejected": 0.6307975053787231, "step": 2617 }, { "epoch": 1.41, "learning_rate": 7.505041537957759e-08, "logits/chosen": -2.155057430267334, "logits/rejected": -2.2577455043792725, "logps/chosen": -11.504698753356934, "logps/rejected": -3.681166172027588, "loss": 1.1298, "rewards/accuracies": 0.0, "rewards/chosen": -0.0024641037452965975, "rewards/margins": -0.7395175695419312, "rewards/rejected": 0.7370534539222717, "step": 2618 }, { "epoch": 1.41, "learning_rate": 7.50315135906739e-08, "logits/chosen": -2.0190980434417725, "logits/rejected": -2.0168509483337402, "logps/chosen": -0.886830747127533, "logps/rejected": -4.11370849609375, "loss": 0.5083, "rewards/accuracies": 1.0, "rewards/chosen": 0.9415403604507446, "rewards/margins": 0.4118086099624634, "rewards/rejected": 0.5297317504882812, "step": 2619 }, { "epoch": 1.41, "learning_rate": 7.501260702684328e-08, "logits/chosen": -2.1638894081115723, "logits/rejected": -2.256737232208252, "logps/chosen": -1.9806078672409058, "logps/rejected": -2.1149868965148926, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.9963812232017517, "rewards/margins": 0.01853412389755249, "rewards/rejected": 0.9778470993041992, "step": 2620 }, { "epoch": 1.41, "learning_rate": 7.499369569169229e-08, "logits/chosen": -2.0319104194641113, "logits/rejected": -2.0317671298980713, "logps/chosen": -0.3272966146469116, "logps/rejected": -4.244846820831299, "loss": 0.479, "rewards/accuracies": 1.0, "rewards/chosen": 1.0694504976272583, "rewards/margins": 0.4869024157524109, "rewards/rejected": 0.5825480818748474, "step": 2621 }, { "epoch": 1.41, "learning_rate": 7.497477958882841e-08, "logits/chosen": -1.941300868988037, "logits/rejected": -2.2106335163116455, "logps/chosen": -0.2176702320575714, "logps/rejected": -0.27818286418914795, "loss": 0.7005, "rewards/accuracies": 0.0, "rewards/chosen": 0.9265047311782837, "rewards/margins": -0.014572501182556152, "rewards/rejected": 0.9410772323608398, "step": 2622 }, { "epoch": 1.41, "learning_rate": 7.495585872186e-08, "logits/chosen": -2.0606634616851807, "logits/rejected": -2.057204484939575, "logps/chosen": -5.678828716278076, "logps/rejected": -4.7220139503479, "loss": 0.4476, "rewards/accuracies": 1.0, "rewards/chosen": 0.9577323794364929, "rewards/margins": 0.5717180967330933, "rewards/rejected": 0.38601431250572205, "step": 2623 }, { "epoch": 1.42, "learning_rate": 7.493693309439632e-08, "logits/chosen": -2.026883840560913, "logits/rejected": -2.0209195613861084, "logps/chosen": -0.7539205551147461, "logps/rejected": -4.884394645690918, "loss": 0.4676, "rewards/accuracies": 1.0, "rewards/chosen": 1.0619043111801147, "rewards/margins": 0.5173594355583191, "rewards/rejected": 0.5445448756217957, "step": 2624 }, { "epoch": 1.42, "learning_rate": 7.491800271004759e-08, "logits/chosen": -2.0012176036834717, "logits/rejected": -2.198482036590576, "logps/chosen": -1.134178876876831, "logps/rejected": -1.182433009147644, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.9270275235176086, "rewards/margins": 0.001465141773223877, "rewards/rejected": 0.9255623817443848, "step": 2625 }, { "epoch": 1.42, "learning_rate": 7.489906757242488e-08, "logits/chosen": -2.0625932216644287, "logits/rejected": -2.2741708755493164, "logps/chosen": -6.7795891761779785, "logps/rejected": -2.9227921962738037, "loss": 0.743, "rewards/accuracies": 0.0, "rewards/chosen": 0.8204145431518555, "rewards/margins": -0.09737414121627808, "rewards/rejected": 0.9177886843681335, "step": 2626 }, { "epoch": 1.42, "learning_rate": 7.488012768514023e-08, "logits/chosen": -2.0166268348693848, "logits/rejected": -2.0226008892059326, "logps/chosen": -1.2561227083206177, "logps/rejected": -4.402317523956299, "loss": 0.412, "rewards/accuracies": 1.0, "rewards/chosen": 0.9536482095718384, "rewards/margins": 0.673635721206665, "rewards/rejected": 0.28001245856285095, "step": 2627 }, { "epoch": 1.42, "learning_rate": 7.48611830518065e-08, "logits/chosen": -2.07727313041687, "logits/rejected": -2.268939733505249, "logps/chosen": -1.2865387201309204, "logps/rejected": -1.4927725791931152, "loss": 0.6667, "rewards/accuracies": 1.0, "rewards/chosen": 0.8234407305717468, "rewards/margins": 0.053534507751464844, "rewards/rejected": 0.769906222820282, "step": 2628 }, { "epoch": 1.42, "learning_rate": 7.484223367603755e-08, "logits/chosen": -2.0190653800964355, "logits/rejected": -2.0185158252716064, "logps/chosen": -2.5216541290283203, "logps/rejected": -5.228680610656738, "loss": 0.4629, "rewards/accuracies": 1.0, "rewards/chosen": 1.10218346118927, "rewards/margins": 0.5300008654594421, "rewards/rejected": 0.5721825957298279, "step": 2629 }, { "epoch": 1.42, "learning_rate": 7.482327956144807e-08, "logits/chosen": -2.105238437652588, "logits/rejected": -2.0238754749298096, "logps/chosen": -24.266624450683594, "logps/rejected": -2.9859659671783447, "loss": 0.305, "rewards/accuracies": 1.0, "rewards/chosen": 1.6112194061279297, "rewards/margins": 1.0311286449432373, "rewards/rejected": 0.5800907015800476, "step": 2630 }, { "epoch": 1.42, "learning_rate": 7.48043207116537e-08, "logits/chosen": -2.154604434967041, "logits/rejected": -2.2609658241271973, "logps/chosen": -1.0773652791976929, "logps/rejected": -1.1580344438552856, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.8576105237007141, "rewards/margins": 0.005921006202697754, "rewards/rejected": 0.8516895174980164, "step": 2631 }, { "epoch": 1.42, "learning_rate": 7.478535713027094e-08, "logits/chosen": -2.0079212188720703, "logits/rejected": -2.273940086364746, "logps/chosen": -1.901216745376587, "logps/rejected": -0.7451815009117126, "loss": 0.6988, "rewards/accuracies": 0.0, "rewards/chosen": 1.0379996299743652, "rewards/margins": -0.011219382286071777, "rewards/rejected": 1.049219012260437, "step": 2632 }, { "epoch": 1.42, "learning_rate": 7.476638882091724e-08, "logits/chosen": -1.9862109422683716, "logits/rejected": -2.0040440559387207, "logps/chosen": -1.8986005783081055, "logps/rejected": -6.024672508239746, "loss": 0.3899, "rewards/accuracies": 1.0, "rewards/chosen": 1.2422782182693481, "rewards/margins": 0.7404977679252625, "rewards/rejected": 0.5017804503440857, "step": 2633 }, { "epoch": 1.42, "learning_rate": 7.474741578721094e-08, "logits/chosen": -2.033571720123291, "logits/rejected": -2.2570273876190186, "logps/chosen": -0.6311982870101929, "logps/rejected": -0.6403886675834656, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 1.011680006980896, "rewards/margins": 0.03787875175476074, "rewards/rejected": 0.9738012552261353, "step": 2634 }, { "epoch": 1.42, "learning_rate": 7.472843803277126e-08, "logits/chosen": -2.120262622833252, "logits/rejected": -2.114589214324951, "logps/chosen": -4.599976539611816, "logps/rejected": -4.248322486877441, "loss": 0.3057, "rewards/accuracies": 1.0, "rewards/chosen": 1.4373456239700317, "rewards/margins": 1.0284181833267212, "rewards/rejected": 0.40892744064331055, "step": 2635 }, { "epoch": 1.42, "learning_rate": 7.470945556121831e-08, "logits/chosen": -2.1576859951019287, "logits/rejected": -2.085466146469116, "logps/chosen": -31.10909652709961, "logps/rejected": -3.796968460083008, "loss": 0.3642, "rewards/accuracies": 1.0, "rewards/chosen": 1.4342052936553955, "rewards/margins": 0.822387158870697, "rewards/rejected": 0.6118181347846985, "step": 2636 }, { "epoch": 1.42, "learning_rate": 7.469046837617316e-08, "logits/chosen": -2.122727155685425, "logits/rejected": -2.1189303398132324, "logps/chosen": -6.737854480743408, "logps/rejected": -3.372084617614746, "loss": 0.3695, "rewards/accuracies": 1.0, "rewards/chosen": 1.4174031019210815, "rewards/margins": 0.8051119446754456, "rewards/rejected": 0.612291157245636, "step": 2637 }, { "epoch": 1.42, "learning_rate": 7.467147648125772e-08, "logits/chosen": -2.014925956726074, "logits/rejected": -2.0171871185302734, "logps/chosen": -2.225309371948242, "logps/rejected": -4.294111251831055, "loss": 0.2664, "rewards/accuracies": 1.0, "rewards/chosen": 1.688327670097351, "rewards/margins": 1.1863974332809448, "rewards/rejected": 0.5019302368164062, "step": 2638 }, { "epoch": 1.42, "learning_rate": 7.465247988009482e-08, "logits/chosen": -2.069444179534912, "logits/rejected": -2.068854331970215, "logps/chosen": -2.4213762283325195, "logps/rejected": -4.0203728675842285, "loss": 0.5358, "rewards/accuracies": 1.0, "rewards/chosen": 0.8922255635261536, "rewards/margins": 0.34414970874786377, "rewards/rejected": 0.5480758547782898, "step": 2639 }, { "epoch": 1.42, "learning_rate": 7.463347857630821e-08, "logits/chosen": -2.0474092960357666, "logits/rejected": -2.039654493331909, "logps/chosen": -2.577956199645996, "logps/rejected": -5.584183692932129, "loss": 0.33, "rewards/accuracies": 1.0, "rewards/chosen": 1.365293264389038, "rewards/margins": 0.9391916990280151, "rewards/rejected": 0.42610159516334534, "step": 2640 }, { "epoch": 1.42, "learning_rate": 7.461447257352248e-08, "logits/chosen": -2.0725643634796143, "logits/rejected": -2.0697710514068604, "logps/chosen": -2.7443437576293945, "logps/rejected": -5.177354335784912, "loss": 0.5141, "rewards/accuracies": 1.0, "rewards/chosen": 0.9607475399971008, "rewards/margins": 0.39727699756622314, "rewards/rejected": 0.5634705424308777, "step": 2641 }, { "epoch": 1.43, "learning_rate": 7.459546187536316e-08, "logits/chosen": -2.091644763946533, "logits/rejected": -2.101266860961914, "logps/chosen": -3.0088512897491455, "logps/rejected": -3.038151741027832, "loss": 0.477, "rewards/accuracies": 1.0, "rewards/chosen": 0.9834065437316895, "rewards/margins": 0.4921490550041199, "rewards/rejected": 0.4912574887275696, "step": 2642 }, { "epoch": 1.43, "learning_rate": 7.457644648545668e-08, "logits/chosen": -2.207660436630249, "logits/rejected": -2.1932997703552246, "logps/chosen": -10.898066520690918, "logps/rejected": -5.847537994384766, "loss": 0.563, "rewards/accuracies": 1.0, "rewards/chosen": 1.2096368074417114, "rewards/margins": 0.2798563241958618, "rewards/rejected": 0.9297804832458496, "step": 2643 }, { "epoch": 1.43, "learning_rate": 7.455742640743035e-08, "logits/chosen": -2.0852980613708496, "logits/rejected": -2.0855376720428467, "logps/chosen": -0.7100039124488831, "logps/rejected": -10.377281188964844, "loss": 0.5066, "rewards/accuracies": 1.0, "rewards/chosen": 1.0956960916519165, "rewards/margins": 0.41614001989364624, "rewards/rejected": 0.6795560717582703, "step": 2644 }, { "epoch": 1.43, "learning_rate": 7.453840164491236e-08, "logits/chosen": -1.9923951625823975, "logits/rejected": -2.255934715270996, "logps/chosen": -1.2431516647338867, "logps/rejected": -1.2759335041046143, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.8808512687683105, "rewards/margins": 0.009839236736297607, "rewards/rejected": 0.8710120320320129, "step": 2645 }, { "epoch": 1.43, "learning_rate": 7.451937220153183e-08, "logits/chosen": -1.9751814603805542, "logits/rejected": -1.9752099514007568, "logps/chosen": -1.72247314453125, "logps/rejected": -0.841239869594574, "loss": 0.5841, "rewards/accuracies": 1.0, "rewards/chosen": 1.1228008270263672, "rewards/margins": 0.2314799427986145, "rewards/rejected": 0.8913208842277527, "step": 2646 }, { "epoch": 1.43, "learning_rate": 7.450033808091873e-08, "logits/chosen": -2.0610718727111816, "logits/rejected": -2.0662715435028076, "logps/chosen": -3.07427978515625, "logps/rejected": -0.4751042127609253, "loss": 0.5238, "rewards/accuracies": 1.0, "rewards/chosen": 1.2515240907669067, "rewards/margins": 0.3733201026916504, "rewards/rejected": 0.8782039880752563, "step": 2647 }, { "epoch": 1.43, "learning_rate": 7.448129928670397e-08, "logits/chosen": -1.9487682580947876, "logits/rejected": -2.232822895050049, "logps/chosen": -3.319425582885742, "logps/rejected": -3.7533557415008545, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.7006278038024902, "rewards/margins": 0.012323379516601562, "rewards/rejected": 0.6883044242858887, "step": 2648 }, { "epoch": 1.43, "learning_rate": 7.44622558225193e-08, "logits/chosen": -2.072539806365967, "logits/rejected": -2.0603904724121094, "logps/chosen": -3.872257709503174, "logps/rejected": -6.003525257110596, "loss": 0.2742, "rewards/accuracies": 1.0, "rewards/chosen": 1.4818159341812134, "rewards/margins": 1.153503179550171, "rewards/rejected": 0.3283127248287201, "step": 2649 }, { "epoch": 1.43, "learning_rate": 7.444320769199741e-08, "logits/chosen": -2.1466877460479736, "logits/rejected": -2.2648234367370605, "logps/chosen": -4.77357292175293, "logps/rejected": -2.1467838287353516, "loss": 0.6475, "rewards/accuracies": 1.0, "rewards/chosen": 0.9949556589126587, "rewards/margins": 0.09349149465560913, "rewards/rejected": 0.9014641642570496, "step": 2650 }, { "epoch": 1.43, "learning_rate": 7.442415489877185e-08, "logits/chosen": -2.021959066390991, "logits/rejected": -2.0218546390533447, "logps/chosen": -0.9193699955940247, "logps/rejected": -3.68204402923584, "loss": 0.4789, "rewards/accuracies": 1.0, "rewards/chosen": 0.9667919278144836, "rewards/margins": 0.48732051253318787, "rewards/rejected": 0.4794714152812958, "step": 2651 }, { "epoch": 1.43, "learning_rate": 7.440509744647704e-08, "logits/chosen": -2.154961109161377, "logits/rejected": -2.296201229095459, "logps/chosen": -5.608585357666016, "logps/rejected": -3.712613105773926, "loss": 0.6481, "rewards/accuracies": 1.0, "rewards/chosen": 0.6733116507530212, "rewards/margins": 0.09225380420684814, "rewards/rejected": 0.5810578465461731, "step": 2652 }, { "epoch": 1.43, "learning_rate": 7.438603533874836e-08, "logits/chosen": -2.174630641937256, "logits/rejected": -2.304598569869995, "logps/chosen": -0.3692571222782135, "logps/rejected": -0.3496038317680359, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.9000443816184998, "rewards/margins": 0.015597164630889893, "rewards/rejected": 0.8844472169876099, "step": 2653 }, { "epoch": 1.43, "learning_rate": 7.436696857922201e-08, "logits/chosen": -2.057633399963379, "logits/rejected": -2.362816095352173, "logps/chosen": -2.4645392894744873, "logps/rejected": -2.540754556655884, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9554118514060974, "rewards/margins": 0.005629360675811768, "rewards/rejected": 0.9497824907302856, "step": 2654 }, { "epoch": 1.43, "learning_rate": 7.43478971715351e-08, "logits/chosen": -2.2121076583862305, "logits/rejected": -2.3395211696624756, "logps/chosen": -0.5057756304740906, "logps/rejected": -0.5322766304016113, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 1.0385782718658447, "rewards/margins": -0.0037914514541625977, "rewards/rejected": 1.0423697233200073, "step": 2655 }, { "epoch": 1.43, "learning_rate": 7.432882111932561e-08, "logits/chosen": -1.9951051473617554, "logits/rejected": -2.221968173980713, "logps/chosen": -0.8666014075279236, "logps/rejected": -0.8274928331375122, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 1.0423979759216309, "rewards/margins": 0.019207000732421875, "rewards/rejected": 1.023190975189209, "step": 2656 }, { "epoch": 1.43, "learning_rate": 7.430974042623246e-08, "logits/chosen": -1.9498610496520996, "logits/rejected": -1.9510297775268555, "logps/chosen": -1.6397744417190552, "logps/rejected": -0.8079332709312439, "loss": 0.6286, "rewards/accuracies": 1.0, "rewards/chosen": 1.07516348361969, "rewards/margins": 0.1334589123725891, "rewards/rejected": 0.9417045712471008, "step": 2657 }, { "epoch": 1.43, "learning_rate": 7.42906550958954e-08, "logits/chosen": -1.9726710319519043, "logits/rejected": -2.2821044921875, "logps/chosen": -0.43638014793395996, "logps/rejected": -0.5258533358573914, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 1.0045791864395142, "rewards/margins": 0.028571248054504395, "rewards/rejected": 0.9760079383850098, "step": 2658 }, { "epoch": 1.43, "learning_rate": 7.427156513195508e-08, "logits/chosen": -2.1168603897094727, "logits/rejected": -2.1188673973083496, "logps/chosen": -0.24054884910583496, "logps/rejected": -5.391560077667236, "loss": 0.4543, "rewards/accuracies": 1.0, "rewards/chosen": 0.9257287979125977, "rewards/margins": 0.5531258583068848, "rewards/rejected": 0.3726029098033905, "step": 2659 }, { "epoch": 1.43, "learning_rate": 7.425247053805304e-08, "logits/chosen": -2.095982551574707, "logits/rejected": -2.10990047454834, "logps/chosen": -4.985615253448486, "logps/rejected": -3.5860230922698975, "loss": 0.5539, "rewards/accuracies": 1.0, "rewards/chosen": 1.0998210906982422, "rewards/margins": 0.30101507902145386, "rewards/rejected": 0.7988060116767883, "step": 2660 }, { "epoch": 1.44, "learning_rate": 7.423337131783172e-08, "logits/chosen": -2.1115050315856934, "logits/rejected": -2.114434003829956, "logps/chosen": -3.2293107509613037, "logps/rejected": -1.3065705299377441, "loss": 0.6047, "rewards/accuracies": 1.0, "rewards/chosen": 1.3711200952529907, "rewards/margins": 0.1854231357574463, "rewards/rejected": 1.1856969594955444, "step": 2661 }, { "epoch": 1.44, "learning_rate": 7.421426747493439e-08, "logits/chosen": -2.0583229064941406, "logits/rejected": -2.272280216217041, "logps/chosen": -0.4781809151172638, "logps/rejected": -5.227844715118408, "loss": 0.5536, "rewards/accuracies": 1.0, "rewards/chosen": 0.9928951263427734, "rewards/margins": 0.30182862281799316, "rewards/rejected": 0.6910665035247803, "step": 2662 }, { "epoch": 1.44, "learning_rate": 7.419515901300524e-08, "logits/chosen": -2.106844425201416, "logits/rejected": -2.1008307933807373, "logps/chosen": -2.363723039627075, "logps/rejected": -3.56280517578125, "loss": 0.5259, "rewards/accuracies": 1.0, "rewards/chosen": 1.2714561223983765, "rewards/margins": 0.36828964948654175, "rewards/rejected": 0.9031664729118347, "step": 2663 }, { "epoch": 1.44, "learning_rate": 7.417604593568935e-08, "logits/chosen": -2.051542043685913, "logits/rejected": -2.2952370643615723, "logps/chosen": -6.450350284576416, "logps/rejected": -0.7349936962127686, "loss": 0.7688, "rewards/accuracies": 0.0, "rewards/chosen": 0.7986273169517517, "rewards/margins": -0.1460530161857605, "rewards/rejected": 0.9446803331375122, "step": 2664 }, { "epoch": 1.44, "learning_rate": 7.415692824663267e-08, "logits/chosen": -2.0917320251464844, "logits/rejected": -2.285640239715576, "logps/chosen": -0.812092661857605, "logps/rejected": -0.916983425617218, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.901257336139679, "rewards/margins": 0.03434336185455322, "rewards/rejected": 0.8669139742851257, "step": 2665 }, { "epoch": 1.44, "learning_rate": 7.4137805949482e-08, "logits/chosen": -1.947198748588562, "logits/rejected": -1.9166960716247559, "logps/chosen": -10.918283462524414, "logps/rejected": -4.927316665649414, "loss": 0.4795, "rewards/accuracies": 1.0, "rewards/chosen": 0.9116619229316711, "rewards/margins": 0.48567983508110046, "rewards/rejected": 0.4259820878505707, "step": 2666 }, { "epoch": 1.44, "learning_rate": 7.411867904788507e-08, "logits/chosen": -2.041806936264038, "logits/rejected": -2.3103952407836914, "logps/chosen": -0.47839075326919556, "logps/rejected": -0.512536883354187, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8337279558181763, "rewards/margins": 0.011117875576019287, "rewards/rejected": 0.822610080242157, "step": 2667 }, { "epoch": 1.44, "learning_rate": 7.409954754549046e-08, "logits/chosen": -1.9544998407363892, "logits/rejected": -1.9547388553619385, "logps/chosen": -2.8663601875305176, "logps/rejected": -0.6005645990371704, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.7786905169487, "rewards/margins": 0.0020088553428649902, "rewards/rejected": 0.776681661605835, "step": 2668 }, { "epoch": 1.44, "learning_rate": 7.408041144594761e-08, "logits/chosen": -2.013054132461548, "logits/rejected": -2.01965069770813, "logps/chosen": -1.1074955463409424, "logps/rejected": -3.4397714138031006, "loss": 0.4682, "rewards/accuracies": 1.0, "rewards/chosen": 1.0261327028274536, "rewards/margins": 0.5156094431877136, "rewards/rejected": 0.51052325963974, "step": 2669 }, { "epoch": 1.44, "learning_rate": 7.406127075290688e-08, "logits/chosen": -2.0319254398345947, "logits/rejected": -2.0370121002197266, "logps/chosen": -0.4868229627609253, "logps/rejected": -4.015171051025391, "loss": 0.5026, "rewards/accuracies": 1.0, "rewards/chosen": 0.834746778011322, "rewards/margins": 0.426057368516922, "rewards/rejected": 0.4086894094944, "step": 2670 }, { "epoch": 1.44, "learning_rate": 7.404212547001947e-08, "logits/chosen": -2.1518490314483643, "logits/rejected": -2.2715659141540527, "logps/chosen": -1.6784676313400269, "logps/rejected": -1.5495723485946655, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.6220621466636658, "rewards/margins": -0.002718687057495117, "rewards/rejected": 0.6247808337211609, "step": 2671 }, { "epoch": 1.44, "learning_rate": 7.402297560093749e-08, "logits/chosen": -2.151275396347046, "logits/rejected": -2.0524327754974365, "logps/chosen": -30.157062530517578, "logps/rejected": -5.916554927825928, "loss": 0.1896, "rewards/accuracies": 1.0, "rewards/chosen": 2.14313006401062, "rewards/margins": 1.566720724105835, "rewards/rejected": 0.5764093995094299, "step": 2672 }, { "epoch": 1.44, "learning_rate": 7.400382114931386e-08, "logits/chosen": -2.1544649600982666, "logits/rejected": -2.275381088256836, "logps/chosen": -0.25231388211250305, "logps/rejected": -0.2460680603981018, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9490920305252075, "rewards/margins": 0.013363838195800781, "rewards/rejected": 0.9357281923294067, "step": 2673 }, { "epoch": 1.44, "learning_rate": 7.398466211880246e-08, "logits/chosen": -2.0700249671936035, "logits/rejected": -2.277799606323242, "logps/chosen": -0.4041774272918701, "logps/rejected": -0.3628208637237549, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.9185354113578796, "rewards/margins": 0.020940780639648438, "rewards/rejected": 0.8975946307182312, "step": 2674 }, { "epoch": 1.44, "learning_rate": 7.3965498513058e-08, "logits/chosen": -2.0752906799316406, "logits/rejected": -2.078470468521118, "logps/chosen": -3.4885787963867188, "logps/rejected": -14.157732009887695, "loss": 0.2678, "rewards/accuracies": 1.0, "rewards/chosen": 1.201461672782898, "rewards/margins": 1.1807186603546143, "rewards/rejected": 0.02074298821389675, "step": 2675 }, { "epoch": 1.44, "learning_rate": 7.394633033573605e-08, "logits/chosen": -2.2048823833465576, "logits/rejected": -2.169818878173828, "logps/chosen": -29.2041072845459, "logps/rejected": -1.9458990097045898, "loss": 0.4053, "rewards/accuracies": 1.0, "rewards/chosen": 1.7200788259506226, "rewards/margins": 0.6937389373779297, "rewards/rejected": 1.0263398885726929, "step": 2676 }, { "epoch": 1.44, "learning_rate": 7.392715759049308e-08, "logits/chosen": -1.993709683418274, "logits/rejected": -2.305157423019409, "logps/chosen": -0.4024985134601593, "logps/rejected": -0.49937137961387634, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 1.00251042842865, "rewards/margins": 0.025106608867645264, "rewards/rejected": 0.9774038195610046, "step": 2677 }, { "epoch": 1.44, "learning_rate": 7.390798028098641e-08, "logits/chosen": -2.1453585624694824, "logits/rejected": -2.2882964611053467, "logps/chosen": -5.625917911529541, "logps/rejected": -5.438967704772949, "loss": 0.6562, "rewards/accuracies": 1.0, "rewards/chosen": 0.6122618317604065, "rewards/margins": 0.07529723644256592, "rewards/rejected": 0.5369645953178406, "step": 2678 }, { "epoch": 1.44, "learning_rate": 7.388879841087422e-08, "logits/chosen": -2.135582685470581, "logits/rejected": -2.1976962089538574, "logps/chosen": -7.371032238006592, "logps/rejected": -17.488121032714844, "loss": 0.541, "rewards/accuracies": 1.0, "rewards/chosen": 1.4664157629013062, "rewards/margins": 0.33158910274505615, "rewards/rejected": 1.13482666015625, "step": 2679 }, { "epoch": 1.45, "learning_rate": 7.38696119838156e-08, "logits/chosen": -2.189220905303955, "logits/rejected": -2.3060855865478516, "logps/chosen": -2.675119638442993, "logps/rejected": -2.8292911052703857, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.9323163032531738, "rewards/margins": 0.014808058738708496, "rewards/rejected": 0.9175082445144653, "step": 2680 }, { "epoch": 1.45, "learning_rate": 7.38504210034705e-08, "logits/chosen": -2.0555028915405273, "logits/rejected": -2.0514657497406006, "logps/chosen": -2.452489137649536, "logps/rejected": -3.483234405517578, "loss": 0.5573, "rewards/accuracies": 1.0, "rewards/chosen": 0.9241113066673279, "rewards/margins": 0.2930736541748047, "rewards/rejected": 0.6310376524925232, "step": 2681 }, { "epoch": 1.45, "learning_rate": 7.38312254734997e-08, "logits/chosen": -2.05680775642395, "logits/rejected": -2.059648036956787, "logps/chosen": -0.8759604096412659, "logps/rejected": -4.422675132751465, "loss": 0.5459, "rewards/accuracies": 1.0, "rewards/chosen": 0.9300031661987305, "rewards/margins": 0.3198789358139038, "rewards/rejected": 0.6101242303848267, "step": 2682 }, { "epoch": 1.45, "learning_rate": 7.381202539756489e-08, "logits/chosen": -2.111560821533203, "logits/rejected": -2.135958671569824, "logps/chosen": -8.363402366638184, "logps/rejected": -1.8616230487823486, "loss": 0.4293, "rewards/accuracies": 1.0, "rewards/chosen": 1.5179930925369263, "rewards/margins": 0.6231527328491211, "rewards/rejected": 0.8948403596878052, "step": 2683 }, { "epoch": 1.45, "learning_rate": 7.37928207793286e-08, "logits/chosen": -1.9856764078140259, "logits/rejected": -1.9929685592651367, "logps/chosen": -1.8721641302108765, "logps/rejected": -4.13901424407959, "loss": 0.4368, "rewards/accuracies": 1.0, "rewards/chosen": 1.1252844333648682, "rewards/margins": 0.6020211577415466, "rewards/rejected": 0.5232632756233215, "step": 2684 }, { "epoch": 1.45, "learning_rate": 7.377361162245426e-08, "logits/chosen": -2.0493111610412598, "logits/rejected": -2.285792827606201, "logps/chosen": -0.31368815898895264, "logps/rejected": -0.3062366843223572, "loss": 0.6699, "rewards/accuracies": 1.0, "rewards/chosen": 0.9738757014274597, "rewards/margins": 0.047020792961120605, "rewards/rejected": 0.9268549084663391, "step": 2685 }, { "epoch": 1.45, "learning_rate": 7.375439793060611e-08, "logits/chosen": -1.9883478879928589, "logits/rejected": -1.9870177507400513, "logps/chosen": -2.110844850540161, "logps/rejected": -8.752004623413086, "loss": 0.2846, "rewards/accuracies": 1.0, "rewards/chosen": 1.4767262935638428, "rewards/margins": 1.110967755317688, "rewards/rejected": 0.3657585084438324, "step": 2686 }, { "epoch": 1.45, "learning_rate": 7.373517970744933e-08, "logits/chosen": -2.0320920944213867, "logits/rejected": -2.0251219272613525, "logps/chosen": -5.007343292236328, "logps/rejected": -2.535799503326416, "loss": 0.4636, "rewards/accuracies": 1.0, "rewards/chosen": 1.2757495641708374, "rewards/margins": 0.5280248522758484, "rewards/rejected": 0.747724711894989, "step": 2687 }, { "epoch": 1.45, "learning_rate": 7.37159569566499e-08, "logits/chosen": -2.1148531436920166, "logits/rejected": -2.109954833984375, "logps/chosen": -3.9925601482391357, "logps/rejected": -1.7994743585586548, "loss": 0.3707, "rewards/accuracies": 1.0, "rewards/chosen": 1.6132456064224243, "rewards/margins": 0.8014366030693054, "rewards/rejected": 0.8118090033531189, "step": 2688 }, { "epoch": 1.45, "learning_rate": 7.369672968187466e-08, "logits/chosen": -2.0414249897003174, "logits/rejected": -2.0406382083892822, "logps/chosen": -2.0876970291137695, "logps/rejected": -0.7608816027641296, "loss": 0.619, "rewards/accuracies": 1.0, "rewards/chosen": 0.8757039904594421, "rewards/margins": 0.1541285514831543, "rewards/rejected": 0.7215754389762878, "step": 2689 }, { "epoch": 1.45, "learning_rate": 7.367749788679138e-08, "logits/chosen": -2.1403515338897705, "logits/rejected": -2.3020970821380615, "logps/chosen": -3.122790575027466, "logps/rejected": -3.005361557006836, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.9911521077156067, "rewards/margins": -0.007460296154022217, "rewards/rejected": 0.9986124038696289, "step": 2690 }, { "epoch": 1.45, "learning_rate": 7.365826157506866e-08, "logits/chosen": -2.058725357055664, "logits/rejected": -2.060394048690796, "logps/chosen": -2.1338183879852295, "logps/rejected": -1.175034523010254, "loss": 0.5338, "rewards/accuracies": 1.0, "rewards/chosen": 1.2699164152145386, "rewards/margins": 0.34893089532852173, "rewards/rejected": 0.9209855198860168, "step": 2691 }, { "epoch": 1.45, "learning_rate": 7.363902075037592e-08, "logits/chosen": -2.056715726852417, "logits/rejected": -2.061734199523926, "logps/chosen": -3.452507495880127, "logps/rejected": -4.642768383026123, "loss": 0.378, "rewards/accuracies": 1.0, "rewards/chosen": 1.3086169958114624, "rewards/margins": 0.7778884768486023, "rewards/rejected": 0.5307285189628601, "step": 2692 }, { "epoch": 1.45, "learning_rate": 7.36197754163835e-08, "logits/chosen": -2.151766777038574, "logits/rejected": -2.151418447494507, "logps/chosen": -0.7821289896965027, "logps/rejected": -4.4655375480651855, "loss": 0.3855, "rewards/accuracies": 1.0, "rewards/chosen": 1.0883564949035645, "rewards/margins": 0.7542430758476257, "rewards/rejected": 0.3341134190559387, "step": 2693 }, { "epoch": 1.45, "learning_rate": 7.360052557676256e-08, "logits/chosen": -2.125553607940674, "logits/rejected": -2.2645483016967773, "logps/chosen": -3.026292085647583, "logps/rejected": -3.014761447906494, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.9247415661811829, "rewards/margins": 0.015634894371032715, "rewards/rejected": 0.9091066718101501, "step": 2694 }, { "epoch": 1.45, "learning_rate": 7.358127123518512e-08, "logits/chosen": -2.0696377754211426, "logits/rejected": -2.075559616088867, "logps/chosen": -2.7013137340545654, "logps/rejected": -4.397591590881348, "loss": 0.4177, "rewards/accuracies": 1.0, "rewards/chosen": 1.2114099264144897, "rewards/margins": 0.6568928360939026, "rewards/rejected": 0.5545170903205872, "step": 2695 }, { "epoch": 1.45, "learning_rate": 7.35620123953241e-08, "logits/chosen": -2.044241189956665, "logits/rejected": -2.2678277492523193, "logps/chosen": -2.192936420440674, "logps/rejected": -1.5386829376220703, "loss": 0.5898, "rewards/accuracies": 1.0, "rewards/chosen": 1.0860432386398315, "rewards/margins": 0.21862608194351196, "rewards/rejected": 0.8674171566963196, "step": 2696 }, { "epoch": 1.45, "learning_rate": 7.354274906085324e-08, "logits/chosen": -2.1650636196136475, "logits/rejected": -2.1668758392333984, "logps/chosen": -0.7832576036453247, "logps/rejected": -3.547067642211914, "loss": 0.46, "rewards/accuracies": 1.0, "rewards/chosen": 1.0794986486434937, "rewards/margins": 0.5377683043479919, "rewards/rejected": 0.5417303442955017, "step": 2697 }, { "epoch": 1.46, "learning_rate": 7.352348123544717e-08, "logits/chosen": -2.0781545639038086, "logits/rejected": -2.286151647567749, "logps/chosen": -0.3361681401729584, "logps/rejected": -0.36156463623046875, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.9156109094619751, "rewards/margins": 0.015978991985321045, "rewards/rejected": 0.899631917476654, "step": 2698 }, { "epoch": 1.46, "learning_rate": 7.350420892278132e-08, "logits/chosen": -2.043020486831665, "logits/rejected": -2.2553067207336426, "logps/chosen": -2.555046796798706, "logps/rejected": -2.7624154090881348, "loss": 0.6598, "rewards/accuracies": 1.0, "rewards/chosen": 0.6846356391906738, "rewards/margins": 0.06785339117050171, "rewards/rejected": 0.6167822480201721, "step": 2699 }, { "epoch": 1.46, "learning_rate": 7.348493212653204e-08, "logits/chosen": -2.056180477142334, "logits/rejected": -2.0633625984191895, "logps/chosen": -1.218468189239502, "logps/rejected": -12.69890308380127, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 1.105294108390808, "rewards/margins": 0.3281016945838928, "rewards/rejected": 0.7771924138069153, "step": 2700 }, { "epoch": 1.46, "learning_rate": 7.346565085037648e-08, "logits/chosen": -2.125645875930786, "logits/rejected": -2.2762653827667236, "logps/chosen": -3.1996877193450928, "logps/rejected": -3.090366840362549, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.9917283058166504, "rewards/margins": -0.0056639909744262695, "rewards/rejected": 0.9973922967910767, "step": 2701 }, { "epoch": 1.46, "learning_rate": 7.34463650979927e-08, "logits/chosen": -2.029491662979126, "logits/rejected": -2.257136106491089, "logps/chosen": -2.6032845973968506, "logps/rejected": -2.768112897872925, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.7976542711257935, "rewards/margins": 0.03207969665527344, "rewards/rejected": 0.76557457447052, "step": 2702 }, { "epoch": 1.46, "learning_rate": 7.342707487305956e-08, "logits/chosen": -2.155183792114258, "logits/rejected": -2.1876041889190674, "logps/chosen": -2.752868890762329, "logps/rejected": -12.064353942871094, "loss": 0.4511, "rewards/accuracies": 1.0, "rewards/chosen": 1.2857674360275269, "rewards/margins": 0.5620556473731995, "rewards/rejected": 0.7237117886543274, "step": 2703 }, { "epoch": 1.46, "learning_rate": 7.34077801792568e-08, "logits/chosen": -1.956956386566162, "logits/rejected": -2.2415268421173096, "logps/chosen": -1.2831374406814575, "logps/rejected": -1.403342366218567, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 0.9081912040710449, "rewards/margins": 0.02198892831802368, "rewards/rejected": 0.8862022757530212, "step": 2704 }, { "epoch": 1.46, "learning_rate": 7.338848102026503e-08, "logits/chosen": -2.0346503257751465, "logits/rejected": -2.0393948554992676, "logps/chosen": -0.8991733193397522, "logps/rejected": -12.151115417480469, "loss": 0.4919, "rewards/accuracies": 1.0, "rewards/chosen": 1.0844868421554565, "rewards/margins": 0.4534807801246643, "rewards/rejected": 0.6310060620307922, "step": 2705 }, { "epoch": 1.46, "learning_rate": 7.336917739976568e-08, "logits/chosen": -2.111672878265381, "logits/rejected": -2.2532670497894287, "logps/chosen": -3.592939853668213, "logps/rejected": -3.5418524742126465, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.3654298484325409, "rewards/margins": 0.02220773696899414, "rewards/rejected": 0.34322211146354675, "step": 2706 }, { "epoch": 1.46, "learning_rate": 7.334986932144106e-08, "logits/chosen": -1.9937382936477661, "logits/rejected": -2.3185830116271973, "logps/chosen": -0.19346493482589722, "logps/rejected": -0.2119522988796234, "loss": 0.699, "rewards/accuracies": 0.0, "rewards/chosen": 0.992074191570282, "rewards/margins": -0.011737525463104248, "rewards/rejected": 1.0038117170333862, "step": 2707 }, { "epoch": 1.46, "learning_rate": 7.333055678897427e-08, "logits/chosen": -2.0993094444274902, "logits/rejected": -2.237342596054077, "logps/chosen": -2.0013511180877686, "logps/rejected": -0.6208349466323853, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 1.0421319007873535, "rewards/margins": 0.03007960319519043, "rewards/rejected": 1.012052297592163, "step": 2708 }, { "epoch": 1.46, "learning_rate": 7.331123980604934e-08, "logits/chosen": -2.0006966590881348, "logits/rejected": -2.3169362545013428, "logps/chosen": -3.1514782905578613, "logps/rejected": -2.9126832485198975, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.46715646982192993, "rewards/margins": 0.02142682671546936, "rewards/rejected": 0.44572964310646057, "step": 2709 }, { "epoch": 1.46, "learning_rate": 7.329191837635111e-08, "logits/chosen": -2.0656633377075195, "logits/rejected": -2.0660786628723145, "logps/chosen": -2.1450867652893066, "logps/rejected": -1.1567323207855225, "loss": 0.5868, "rewards/accuracies": 1.0, "rewards/chosen": 1.1505587100982666, "rewards/margins": 0.22542089223861694, "rewards/rejected": 0.9251378178596497, "step": 2710 }, { "epoch": 1.46, "learning_rate": 7.327259250356526e-08, "logits/chosen": -2.151153087615967, "logits/rejected": -2.1509127616882324, "logps/chosen": -1.1452147960662842, "logps/rejected": -6.045307636260986, "loss": 0.6227, "rewards/accuracies": 1.0, "rewards/chosen": 1.0755330324172974, "rewards/margins": 0.1461976170539856, "rewards/rejected": 0.9293354153633118, "step": 2711 }, { "epoch": 1.46, "learning_rate": 7.325326219137833e-08, "logits/chosen": -1.969694972038269, "logits/rejected": -1.974809169769287, "logps/chosen": -0.24951055645942688, "logps/rejected": -6.8708391189575195, "loss": 0.4762, "rewards/accuracies": 1.0, "rewards/chosen": 0.8279364705085754, "rewards/margins": 0.49430105090141296, "rewards/rejected": 0.3336354196071625, "step": 2712 }, { "epoch": 1.46, "learning_rate": 7.323392744347771e-08, "logits/chosen": -2.1059327125549316, "logits/rejected": -2.2462716102600098, "logps/chosen": -4.924286842346191, "logps/rejected": -3.674766778945923, "loss": 0.5944, "rewards/accuracies": 1.0, "rewards/chosen": 1.0826338529586792, "rewards/margins": 0.20843291282653809, "rewards/rejected": 0.8742009401321411, "step": 2713 }, { "epoch": 1.46, "learning_rate": 7.321458826355164e-08, "logits/chosen": -2.1648905277252197, "logits/rejected": -2.1643755435943604, "logps/chosen": -1.785083293914795, "logps/rejected": -8.064163208007812, "loss": 0.5667, "rewards/accuracies": 1.0, "rewards/chosen": 1.1623672246932983, "rewards/margins": 0.27127718925476074, "rewards/rejected": 0.8910900354385376, "step": 2714 }, { "epoch": 1.46, "learning_rate": 7.319524465528919e-08, "logits/chosen": -2.208496570587158, "logits/rejected": -2.3170769214630127, "logps/chosen": -0.48395204544067383, "logps/rejected": -0.4444436728954315, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 1.0486501455307007, "rewards/margins": 0.0199507474899292, "rewards/rejected": 1.0286993980407715, "step": 2715 }, { "epoch": 1.46, "learning_rate": 7.317589662238025e-08, "logits/chosen": -2.163062334060669, "logits/rejected": -2.049823522567749, "logps/chosen": -27.874467849731445, "logps/rejected": -3.328003168106079, "loss": 0.2475, "rewards/accuracies": 1.0, "rewards/chosen": 1.790114402770996, "rewards/margins": 1.2701151371002197, "rewards/rejected": 0.5199993252754211, "step": 2716 }, { "epoch": 1.47, "learning_rate": 7.315654416851563e-08, "logits/chosen": -1.9915000200271606, "logits/rejected": -2.275364398956299, "logps/chosen": -1.262475609779358, "logps/rejected": -4.417236804962158, "loss": 0.6099, "rewards/accuracies": 1.0, "rewards/chosen": 0.9955543875694275, "rewards/margins": 0.1740536093711853, "rewards/rejected": 0.8215007781982422, "step": 2717 }, { "epoch": 1.47, "learning_rate": 7.313718729738691e-08, "logits/chosen": -2.1032204627990723, "logits/rejected": -2.099691867828369, "logps/chosen": -5.513660430908203, "logps/rejected": -3.481792688369751, "loss": 0.4198, "rewards/accuracies": 1.0, "rewards/chosen": 1.3183443546295166, "rewards/margins": 0.6507648825645447, "rewards/rejected": 0.6675794720649719, "step": 2718 }, { "epoch": 1.47, "learning_rate": 7.311782601268655e-08, "logits/chosen": -2.064309597015381, "logits/rejected": -2.0530073642730713, "logps/chosen": -13.246834754943848, "logps/rejected": -3.6444084644317627, "loss": 0.35, "rewards/accuracies": 1.0, "rewards/chosen": 1.3649604320526123, "rewards/margins": 0.8698328733444214, "rewards/rejected": 0.4951275885105133, "step": 2719 }, { "epoch": 1.47, "learning_rate": 7.309846031810785e-08, "logits/chosen": -2.166457176208496, "logits/rejected": -2.172175168991089, "logps/chosen": -2.1138622760772705, "logps/rejected": -3.866334915161133, "loss": 0.4978, "rewards/accuracies": 1.0, "rewards/chosen": 0.9247552156448364, "rewards/margins": 0.4384695291519165, "rewards/rejected": 0.4862856864929199, "step": 2720 }, { "epoch": 1.47, "learning_rate": 7.307909021734495e-08, "logits/chosen": -2.185180902481079, "logits/rejected": -2.2818491458892822, "logps/chosen": -0.7612696886062622, "logps/rejected": -0.7919518351554871, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9233710169792175, "rewards/margins": 0.013563394546508789, "rewards/rejected": 0.9098076224327087, "step": 2721 }, { "epoch": 1.47, "learning_rate": 7.305971571409281e-08, "logits/chosen": -2.098775863647461, "logits/rejected": -2.045151948928833, "logps/chosen": -29.22691535949707, "logps/rejected": -3.958791732788086, "loss": 0.2974, "rewards/accuracies": 1.0, "rewards/chosen": 1.6520437002182007, "rewards/margins": 1.0604369640350342, "rewards/rejected": 0.5916067957878113, "step": 2722 }, { "epoch": 1.47, "learning_rate": 7.304033681204726e-08, "logits/chosen": -2.091054677963257, "logits/rejected": -2.100295066833496, "logps/chosen": -0.7105486392974854, "logps/rejected": -11.797627449035645, "loss": 0.5859, "rewards/accuracies": 1.0, "rewards/chosen": 1.006418228149414, "rewards/margins": 0.2273152470588684, "rewards/rejected": 0.7791029810905457, "step": 2723 }, { "epoch": 1.47, "learning_rate": 7.302095351490493e-08, "logits/chosen": -2.14745831489563, "logits/rejected": -2.1320929527282715, "logps/chosen": -10.110490798950195, "logps/rejected": -2.9531047344207764, "loss": 0.5438, "rewards/accuracies": 1.0, "rewards/chosen": 1.0465278625488281, "rewards/margins": 0.32500362396240234, "rewards/rejected": 0.7215242385864258, "step": 2724 }, { "epoch": 1.47, "learning_rate": 7.300156582636332e-08, "logits/chosen": -2.0580122470855713, "logits/rejected": -2.0565345287323, "logps/chosen": -0.47406265139579773, "logps/rejected": -3.348145008087158, "loss": 0.5169, "rewards/accuracies": 1.0, "rewards/chosen": 0.9983534216880798, "rewards/margins": 0.3903416395187378, "rewards/rejected": 0.608011782169342, "step": 2725 }, { "epoch": 1.47, "learning_rate": 7.298217375012076e-08, "logits/chosen": -2.196167230606079, "logits/rejected": -2.183004856109619, "logps/chosen": -11.098841667175293, "logps/rejected": -2.206752061843872, "loss": 0.5111, "rewards/accuracies": 1.0, "rewards/chosen": 1.477079153060913, "rewards/margins": 0.40475380420684814, "rewards/rejected": 1.072325348854065, "step": 2726 }, { "epoch": 1.47, "learning_rate": 7.296277728987645e-08, "logits/chosen": -2.0851333141326904, "logits/rejected": -2.0695788860321045, "logps/chosen": -6.181439399719238, "logps/rejected": -6.866267681121826, "loss": 0.3125, "rewards/accuracies": 1.0, "rewards/chosen": 1.3108817338943481, "rewards/margins": 1.0028847455978394, "rewards/rejected": 0.3079969882965088, "step": 2727 }, { "epoch": 1.47, "learning_rate": 7.294337644933035e-08, "logits/chosen": -2.0995070934295654, "logits/rejected": -2.2560997009277344, "logps/chosen": -0.34416359663009644, "logps/rejected": -0.36335912346839905, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.7670479416847229, "rewards/margins": 0.009688973426818848, "rewards/rejected": 0.757358968257904, "step": 2728 }, { "epoch": 1.47, "learning_rate": 7.29239712321833e-08, "logits/chosen": -2.0725109577178955, "logits/rejected": -2.047928810119629, "logps/chosen": -6.279146194458008, "logps/rejected": -3.614429473876953, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 1.487129807472229, "rewards/margins": 0.8636860847473145, "rewards/rejected": 0.6234437227249146, "step": 2729 }, { "epoch": 1.47, "learning_rate": 7.290456164213701e-08, "logits/chosen": -1.959091305732727, "logits/rejected": -1.9545178413391113, "logps/chosen": -3.999614953994751, "logps/rejected": -5.153509140014648, "loss": 0.2736, "rewards/accuracies": 1.0, "rewards/chosen": 1.6794706583023071, "rewards/margins": 1.1562750339508057, "rewards/rejected": 0.5231955647468567, "step": 2730 }, { "epoch": 1.47, "learning_rate": 7.288514768289397e-08, "logits/chosen": -1.9646984338760376, "logits/rejected": -1.9692771434783936, "logps/chosen": -1.3222436904907227, "logps/rejected": -4.15105676651001, "loss": 0.6286, "rewards/accuracies": 1.0, "rewards/chosen": 1.163002848625183, "rewards/margins": 0.13352322578430176, "rewards/rejected": 1.0294796228408813, "step": 2731 }, { "epoch": 1.47, "learning_rate": 7.286572935815751e-08, "logits/chosen": -2.1780102252960205, "logits/rejected": -2.1755378246307373, "logps/chosen": -3.6174182891845703, "logps/rejected": -4.284318923950195, "loss": 0.2507, "rewards/accuracies": 1.0, "rewards/chosen": 1.7219761610031128, "rewards/margins": 1.2554700374603271, "rewards/rejected": 0.46650609374046326, "step": 2732 }, { "epoch": 1.47, "learning_rate": 7.284630667163181e-08, "logits/chosen": -2.121185302734375, "logits/rejected": -2.1227493286132812, "logps/chosen": -0.2828092575073242, "logps/rejected": -4.942292213439941, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": 1.0514472723007202, "rewards/margins": 0.651187539100647, "rewards/rejected": 0.40025970339775085, "step": 2733 }, { "epoch": 1.47, "learning_rate": 7.282687962702185e-08, "logits/chosen": -2.096240997314453, "logits/rejected": -2.2659237384796143, "logps/chosen": -2.246077537536621, "logps/rejected": -6.977089881896973, "loss": 0.5601, "rewards/accuracies": 1.0, "rewards/chosen": 0.7145482301712036, "rewards/margins": 0.2864852249622345, "rewards/rejected": 0.4280630052089691, "step": 2734 }, { "epoch": 1.48, "learning_rate": 7.280744822803353e-08, "logits/chosen": -2.0236001014709473, "logits/rejected": -2.0141470432281494, "logps/chosen": -6.490354537963867, "logps/rejected": -14.966550827026367, "loss": 0.4544, "rewards/accuracies": 1.0, "rewards/chosen": 1.253485918045044, "rewards/margins": 0.5530319809913635, "rewards/rejected": 0.7004539370536804, "step": 2735 }, { "epoch": 1.48, "learning_rate": 7.278801247837347e-08, "logits/chosen": -2.124157428741455, "logits/rejected": -2.130340337753296, "logps/chosen": -3.544482946395874, "logps/rejected": -2.377288579940796, "loss": 0.4346, "rewards/accuracies": 1.0, "rewards/chosen": 1.1917681694030762, "rewards/margins": 0.6081185340881348, "rewards/rejected": 0.5836496353149414, "step": 2736 }, { "epoch": 1.48, "learning_rate": 7.276857238174918e-08, "logits/chosen": -2.1360924243927, "logits/rejected": -2.1208951473236084, "logps/chosen": -8.175579071044922, "logps/rejected": -6.034781455993652, "loss": 0.3627, "rewards/accuracies": 1.0, "rewards/chosen": 1.4230502843856812, "rewards/margins": 0.8273252844810486, "rewards/rejected": 0.5957249999046326, "step": 2737 }, { "epoch": 1.48, "learning_rate": 7.2749127941869e-08, "logits/chosen": -2.198101282119751, "logits/rejected": -2.093219518661499, "logps/chosen": -19.882049560546875, "logps/rejected": -2.436082363128662, "loss": 0.243, "rewards/accuracies": 1.0, "rewards/chosen": 2.005389451980591, "rewards/margins": 1.2906324863433838, "rewards/rejected": 0.7147569060325623, "step": 2738 }, { "epoch": 1.48, "learning_rate": 7.272967916244206e-08, "logits/chosen": -2.0606017112731934, "logits/rejected": -2.2233314514160156, "logps/chosen": -4.136742115020752, "logps/rejected": -3.209364652633667, "loss": 0.7531, "rewards/accuracies": 0.0, "rewards/chosen": 0.8302850723266602, "rewards/margins": -0.11647504568099976, "rewards/rejected": 0.9467601180076599, "step": 2739 }, { "epoch": 1.48, "learning_rate": 7.271022604717835e-08, "logits/chosen": -2.045759916305542, "logits/rejected": -2.0535855293273926, "logps/chosen": -1.3654718399047852, "logps/rejected": -3.2792251110076904, "loss": 0.475, "rewards/accuracies": 1.0, "rewards/chosen": 1.0489709377288818, "rewards/margins": 0.4976295828819275, "rewards/rejected": 0.5513413548469543, "step": 2740 }, { "epoch": 1.48, "learning_rate": 7.269076859978869e-08, "logits/chosen": -2.004995107650757, "logits/rejected": -2.294593572616577, "logps/chosen": -2.85312557220459, "logps/rejected": -3.4642605781555176, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.9758456349372864, "rewards/margins": 0.040582358837127686, "rewards/rejected": 0.9352632761001587, "step": 2741 }, { "epoch": 1.48, "learning_rate": 7.267130682398472e-08, "logits/chosen": -1.9243658781051636, "logits/rejected": -1.9308902025222778, "logps/chosen": -3.156231641769409, "logps/rejected": -4.1219706535339355, "loss": 0.5299, "rewards/accuracies": 1.0, "rewards/chosen": 0.9961070418357849, "rewards/margins": 0.35835540294647217, "rewards/rejected": 0.6377516388893127, "step": 2742 }, { "epoch": 1.48, "learning_rate": 7.265184072347891e-08, "logits/chosen": -2.0286784172058105, "logits/rejected": -2.273594856262207, "logps/chosen": -12.761724472045898, "logps/rejected": -7.779045104980469, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 0.8058988451957703, "rewards/margins": 0.032978057861328125, "rewards/rejected": 0.7729207873344421, "step": 2743 }, { "epoch": 1.48, "learning_rate": 7.263237030198451e-08, "logits/chosen": -2.0363805294036865, "logits/rejected": -2.3267791271209717, "logps/chosen": -0.6852965950965881, "logps/rejected": -0.603160560131073, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 1.088265299797058, "rewards/margins": 0.009397149085998535, "rewards/rejected": 1.0788681507110596, "step": 2744 }, { "epoch": 1.48, "learning_rate": 7.261289556321566e-08, "logits/chosen": -2.127079486846924, "logits/rejected": -2.1222479343414307, "logps/chosen": -2.4205212593078613, "logps/rejected": -3.6652915477752686, "loss": 0.8103, "rewards/accuracies": 0.0, "rewards/chosen": 0.7721299529075623, "rewards/margins": -0.2220081090927124, "rewards/rejected": 0.9941380620002747, "step": 2745 }, { "epoch": 1.48, "learning_rate": 7.259341651088731e-08, "logits/chosen": -1.939113736152649, "logits/rejected": -1.9379324913024902, "logps/chosen": -1.368468165397644, "logps/rejected": -1.2765864133834839, "loss": 0.7051, "rewards/accuracies": 0.0, "rewards/chosen": 0.9493860602378845, "rewards/margins": -0.023674845695495605, "rewards/rejected": 0.9730609059333801, "step": 2746 }, { "epoch": 1.48, "learning_rate": 7.257393314871518e-08, "logits/chosen": -2.1402647495269775, "logits/rejected": -2.26004958152771, "logps/chosen": -0.31580859422683716, "logps/rejected": -7.060513019561768, "loss": 0.5892, "rewards/accuracies": 1.0, "rewards/chosen": 0.9446835517883301, "rewards/margins": 0.2200453281402588, "rewards/rejected": 0.7246382236480713, "step": 2747 }, { "epoch": 1.48, "learning_rate": 7.255444548041589e-08, "logits/chosen": -2.1487019062042236, "logits/rejected": -2.1517584323883057, "logps/chosen": -2.718616008758545, "logps/rejected": -4.578943729400635, "loss": 0.3639, "rewards/accuracies": 1.0, "rewards/chosen": 1.2559616565704346, "rewards/margins": 0.8235101699829102, "rewards/rejected": 0.4324514865875244, "step": 2748 }, { "epoch": 1.48, "learning_rate": 7.25349535097068e-08, "logits/chosen": -2.1931629180908203, "logits/rejected": -2.202542781829834, "logps/chosen": -1.9051986932754517, "logps/rejected": -3.0747597217559814, "loss": 0.4641, "rewards/accuracies": 1.0, "rewards/chosen": 1.3097963333129883, "rewards/margins": 0.5265201330184937, "rewards/rejected": 0.7832762002944946, "step": 2749 }, { "epoch": 1.48, "learning_rate": 7.251545724030616e-08, "logits/chosen": -2.1143128871917725, "logits/rejected": -2.3569412231445312, "logps/chosen": -0.661176323890686, "logps/rejected": -0.664165735244751, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 1.0302495956420898, "rewards/margins": 0.016991257667541504, "rewards/rejected": 1.0132583379745483, "step": 2750 }, { "epoch": 1.48, "learning_rate": 7.2495956675933e-08, "logits/chosen": -2.1187515258789062, "logits/rejected": -2.123762607574463, "logps/chosen": -2.5932376384735107, "logps/rejected": -4.505558013916016, "loss": 0.4628, "rewards/accuracies": 1.0, "rewards/chosen": 1.0106390714645386, "rewards/margins": 0.5302029252052307, "rewards/rejected": 0.48043614625930786, "step": 2751 }, { "epoch": 1.48, "learning_rate": 7.247645182030719e-08, "logits/chosen": -1.9869741201400757, "logits/rejected": -1.9941234588623047, "logps/chosen": -3.1631507873535156, "logps/rejected": -2.906332015991211, "loss": 0.4389, "rewards/accuracies": 1.0, "rewards/chosen": 1.2188886404037476, "rewards/margins": 0.595897912979126, "rewards/rejected": 0.6229907274246216, "step": 2752 }, { "epoch": 1.48, "learning_rate": 7.245694267714942e-08, "logits/chosen": -2.0119240283966064, "logits/rejected": -1.9945623874664307, "logps/chosen": -3.7658751010894775, "logps/rejected": -6.031672477722168, "loss": 0.4491, "rewards/accuracies": 1.0, "rewards/chosen": 1.1116752624511719, "rewards/margins": 0.5676498413085938, "rewards/rejected": 0.5440254211425781, "step": 2753 }, { "epoch": 1.49, "learning_rate": 7.243742925018117e-08, "logits/chosen": -1.9257838726043701, "logits/rejected": -1.9606633186340332, "logps/chosen": -0.9116108417510986, "logps/rejected": -13.558629989624023, "loss": 0.5295, "rewards/accuracies": 1.0, "rewards/chosen": 1.0518721342086792, "rewards/margins": 0.3594370484352112, "rewards/rejected": 0.692435085773468, "step": 2754 }, { "epoch": 1.49, "learning_rate": 7.241791154312477e-08, "logits/chosen": -2.145548105239868, "logits/rejected": -2.150029420852661, "logps/chosen": -3.3678689002990723, "logps/rejected": -9.51618480682373, "loss": 0.3311, "rewards/accuracies": 1.0, "rewards/chosen": 1.291359543800354, "rewards/margins": 0.9352500438690186, "rewards/rejected": 0.35610952973365784, "step": 2755 }, { "epoch": 1.49, "learning_rate": 7.239838955970334e-08, "logits/chosen": -2.122060775756836, "logits/rejected": -2.1498453617095947, "logps/chosen": -9.60426139831543, "logps/rejected": -15.20450210571289, "loss": 0.5867, "rewards/accuracies": 1.0, "rewards/chosen": 1.1950626373291016, "rewards/margins": 0.22554761171340942, "rewards/rejected": 0.9695150256156921, "step": 2756 }, { "epoch": 1.49, "learning_rate": 7.23788633036408e-08, "logits/chosen": -1.989277720451355, "logits/rejected": -2.2987754344940186, "logps/chosen": -0.9009661078453064, "logps/rejected": -0.7979998588562012, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.9614295363426208, "rewards/margins": 0.03945285081863403, "rewards/rejected": 0.9219766855239868, "step": 2757 }, { "epoch": 1.49, "learning_rate": 7.235933277866198e-08, "logits/chosen": -2.0777926445007324, "logits/rejected": -2.313758611679077, "logps/chosen": -3.4209890365600586, "logps/rejected": -3.2799086570739746, "loss": 0.6986, "rewards/accuracies": 0.0, "rewards/chosen": 0.9438641667366028, "rewards/margins": -0.010955512523651123, "rewards/rejected": 0.9548196792602539, "step": 2758 }, { "epoch": 1.49, "learning_rate": 7.233979798849242e-08, "logits/chosen": -2.284133195877075, "logits/rejected": -2.2924892902374268, "logps/chosen": -2.6953213214874268, "logps/rejected": -4.547914981842041, "loss": 0.3414, "rewards/accuracies": 1.0, "rewards/chosen": 1.5259722471237183, "rewards/margins": 0.8992493748664856, "rewards/rejected": 0.6267228722572327, "step": 2759 }, { "epoch": 1.49, "learning_rate": 7.232025893685849e-08, "logits/chosen": -2.036081075668335, "logits/rejected": -2.0261898040771484, "logps/chosen": -2.9337172508239746, "logps/rejected": -5.79025411605835, "loss": 0.4977, "rewards/accuracies": 1.0, "rewards/chosen": 0.8515844345092773, "rewards/margins": 0.43854308128356934, "rewards/rejected": 0.413041353225708, "step": 2760 }, { "epoch": 1.49, "learning_rate": 7.230071562748743e-08, "logits/chosen": -2.08447527885437, "logits/rejected": -2.0502371788024902, "logps/chosen": -11.402917861938477, "logps/rejected": -3.596891164779663, "loss": 0.3326, "rewards/accuracies": 1.0, "rewards/chosen": 1.6412878036499023, "rewards/margins": 0.930065929889679, "rewards/rejected": 0.7112218737602234, "step": 2761 }, { "epoch": 1.49, "learning_rate": 7.228116806410724e-08, "logits/chosen": -2.163983106613159, "logits/rejected": -2.2926061153411865, "logps/chosen": -5.664863109588623, "logps/rejected": -1.4190634489059448, "loss": 0.7586, "rewards/accuracies": 0.0, "rewards/chosen": 0.9174556732177734, "rewards/margins": -0.12691795825958252, "rewards/rejected": 1.044373631477356, "step": 2762 }, { "epoch": 1.49, "learning_rate": 7.226161625044674e-08, "logits/chosen": -2.052083969116211, "logits/rejected": -2.2783730030059814, "logps/chosen": -2.4252800941467285, "logps/rejected": -2.472769260406494, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 0.8605878949165344, "rewards/margins": 0.03009355068206787, "rewards/rejected": 0.8304943442344666, "step": 2763 }, { "epoch": 1.49, "learning_rate": 7.224206019023559e-08, "logits/chosen": -2.016085386276245, "logits/rejected": -2.3504810333251953, "logps/chosen": -0.4604065418243408, "logps/rejected": -28.39774513244629, "loss": 0.6574, "rewards/accuracies": 1.0, "rewards/chosen": 1.0563563108444214, "rewards/margins": 0.07285010814666748, "rewards/rejected": 0.9835062026977539, "step": 2764 }, { "epoch": 1.49, "learning_rate": 7.222249988720422e-08, "logits/chosen": -2.1355621814727783, "logits/rejected": -2.2062017917633057, "logps/chosen": -0.7199953198432922, "logps/rejected": -0.7164191007614136, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.9462929964065552, "rewards/margins": 0.01564258337020874, "rewards/rejected": 0.9306504130363464, "step": 2765 }, { "epoch": 1.49, "learning_rate": 7.220293534508387e-08, "logits/chosen": -2.043848752975464, "logits/rejected": -2.0725951194763184, "logps/chosen": -4.480222225189209, "logps/rejected": -4.119460105895996, "loss": 0.6017, "rewards/accuracies": 1.0, "rewards/chosen": 1.1434742212295532, "rewards/margins": 0.19199687242507935, "rewards/rejected": 0.9514773488044739, "step": 2766 }, { "epoch": 1.49, "learning_rate": 7.218336656760663e-08, "logits/chosen": -2.0559310913085938, "logits/rejected": -2.062818765640259, "logps/chosen": -6.372293472290039, "logps/rejected": -0.7822117805480957, "loss": 0.4891, "rewards/accuracies": 1.0, "rewards/chosen": 1.2821403741836548, "rewards/margins": 0.46070998907089233, "rewards/rejected": 0.8214303851127625, "step": 2767 }, { "epoch": 1.49, "learning_rate": 7.216379355850539e-08, "logits/chosen": -2.08481764793396, "logits/rejected": -2.085350275039673, "logps/chosen": -2.413177490234375, "logps/rejected": -0.47858476638793945, "loss": 0.4074, "rewards/accuracies": 1.0, "rewards/chosen": 1.4582271575927734, "rewards/margins": 0.687453031539917, "rewards/rejected": 0.7707741260528564, "step": 2768 }, { "epoch": 1.49, "learning_rate": 7.21442163215138e-08, "logits/chosen": -2.1228532791137695, "logits/rejected": -2.1220085620880127, "logps/chosen": -1.126452088356018, "logps/rejected": -1.1376523971557617, "loss": 0.6683, "rewards/accuracies": 1.0, "rewards/chosen": 0.8759952783584595, "rewards/margins": 0.05039501190185547, "rewards/rejected": 0.825600266456604, "step": 2769 }, { "epoch": 1.49, "learning_rate": 7.212463486036637e-08, "logits/chosen": -2.0908303260803223, "logits/rejected": -2.0795302391052246, "logps/chosen": -6.833840847015381, "logps/rejected": -2.6316630840301514, "loss": 0.4663, "rewards/accuracies": 1.0, "rewards/chosen": 1.1378778219223022, "rewards/margins": 0.5207575559616089, "rewards/rejected": 0.6171202659606934, "step": 2770 }, { "epoch": 1.49, "learning_rate": 7.210504917879837e-08, "logits/chosen": -2.0053632259368896, "logits/rejected": -2.01721453666687, "logps/chosen": -1.6793051958084106, "logps/rejected": -2.0973503589630127, "loss": 0.4688, "rewards/accuracies": 1.0, "rewards/chosen": 1.170617938041687, "rewards/margins": 0.5139374136924744, "rewards/rejected": 0.6566805243492126, "step": 2771 }, { "epoch": 1.5, "learning_rate": 7.208545928054593e-08, "logits/chosen": -2.1117923259735107, "logits/rejected": -2.3701133728027344, "logps/chosen": -0.22281204164028168, "logps/rejected": -0.19608207046985626, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.7679501175880432, "rewards/margins": 0.008848071098327637, "rewards/rejected": 0.7591020464897156, "step": 2772 }, { "epoch": 1.5, "learning_rate": 7.206586516934591e-08, "logits/chosen": -2.0187997817993164, "logits/rejected": -2.0231945514678955, "logps/chosen": -3.1307315826416016, "logps/rejected": -3.106858491897583, "loss": 0.4208, "rewards/accuracies": 1.0, "rewards/chosen": 1.3371022939682007, "rewards/margins": 0.6477687358856201, "rewards/rejected": 0.6893335580825806, "step": 2773 }, { "epoch": 1.5, "learning_rate": 7.204626684893603e-08, "logits/chosen": -2.108210802078247, "logits/rejected": -2.077004909515381, "logps/chosen": -11.29376220703125, "logps/rejected": -2.0929880142211914, "loss": 0.2971, "rewards/accuracies": 1.0, "rewards/chosen": 1.730345606803894, "rewards/margins": 1.061492681503296, "rewards/rejected": 0.6688529849052429, "step": 2774 }, { "epoch": 1.5, "learning_rate": 7.202666432305483e-08, "logits/chosen": -1.9871182441711426, "logits/rejected": -2.2713985443115234, "logps/chosen": -0.6566585898399353, "logps/rejected": -0.560133695602417, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.7787950038909912, "rewards/margins": -0.0005509257316589355, "rewards/rejected": 0.7793459296226501, "step": 2775 }, { "epoch": 1.5, "learning_rate": 7.20070575954416e-08, "logits/chosen": -2.1506948471069336, "logits/rejected": -2.152841806411743, "logps/chosen": -0.34395959973335266, "logps/rejected": -5.032596111297607, "loss": 0.4694, "rewards/accuracies": 1.0, "rewards/chosen": 0.843288242816925, "rewards/margins": 0.512421190738678, "rewards/rejected": 0.33086705207824707, "step": 2776 }, { "epoch": 1.5, "learning_rate": 7.198744666983646e-08, "logits/chosen": -1.9379023313522339, "logits/rejected": -1.9383777379989624, "logps/chosen": -1.5428427457809448, "logps/rejected": -5.768873691558838, "loss": 0.5175, "rewards/accuracies": 1.0, "rewards/chosen": 1.0652899742126465, "rewards/margins": 0.388751745223999, "rewards/rejected": 0.6765382289886475, "step": 2777 }, { "epoch": 1.5, "learning_rate": 7.196783154998031e-08, "logits/chosen": -1.9305400848388672, "logits/rejected": -2.2432713508605957, "logps/chosen": -3.0724642276763916, "logps/rejected": -6.162341117858887, "loss": 0.6468, "rewards/accuracies": 1.0, "rewards/chosen": 0.8591465353965759, "rewards/margins": 0.09485387802124023, "rewards/rejected": 0.7642926573753357, "step": 2778 }, { "epoch": 1.5, "learning_rate": 7.194821223961487e-08, "logits/chosen": -1.9979904890060425, "logits/rejected": -2.000317096710205, "logps/chosen": -1.3155078887939453, "logps/rejected": -2.720486879348755, "loss": 0.4925, "rewards/accuracies": 1.0, "rewards/chosen": 1.125219702720642, "rewards/margins": 0.45193564891815186, "rewards/rejected": 0.6732840538024902, "step": 2779 }, { "epoch": 1.5, "learning_rate": 7.192858874248266e-08, "logits/chosen": -2.162991523742676, "logits/rejected": -2.277543067932129, "logps/chosen": -3.110783100128174, "logps/rejected": -2.6730802059173584, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.5859893560409546, "rewards/margins": -0.014096319675445557, "rewards/rejected": 0.6000856757164001, "step": 2780 }, { "epoch": 1.5, "learning_rate": 7.190896106232698e-08, "logits/chosen": -2.092994213104248, "logits/rejected": -2.3192873001098633, "logps/chosen": -1.3773527145385742, "logps/rejected": -1.2358448505401611, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.9287800788879395, "rewards/margins": 0.006826221942901611, "rewards/rejected": 0.9219538569450378, "step": 2781 }, { "epoch": 1.5, "learning_rate": 7.188932920289195e-08, "logits/chosen": -2.149460792541504, "logits/rejected": -2.1148340702056885, "logps/chosen": -21.426240921020508, "logps/rejected": -3.347187042236328, "loss": 0.2289, "rewards/accuracies": 1.0, "rewards/chosen": 1.9502019882202148, "rewards/margins": 1.3580572605133057, "rewards/rejected": 0.592144787311554, "step": 2782 }, { "epoch": 1.5, "learning_rate": 7.186969316792247e-08, "logits/chosen": -2.142392873764038, "logits/rejected": -2.13521146774292, "logps/chosen": -3.339282989501953, "logps/rejected": -9.008630752563477, "loss": 0.352, "rewards/accuracies": 1.0, "rewards/chosen": 1.3130712509155273, "rewards/margins": 0.8630361557006836, "rewards/rejected": 0.45003509521484375, "step": 2783 }, { "epoch": 1.5, "learning_rate": 7.185005296116426e-08, "logits/chosen": -2.230659246444702, "logits/rejected": -2.233462333679199, "logps/chosen": -0.38247981667518616, "logps/rejected": -3.2048840522766113, "loss": 0.5201, "rewards/accuracies": 1.0, "rewards/chosen": 0.7385401725769043, "rewards/margins": 0.38245370984077454, "rewards/rejected": 0.35608646273612976, "step": 2784 }, { "epoch": 1.5, "learning_rate": 7.18304085863638e-08, "logits/chosen": -2.1228437423706055, "logits/rejected": -2.359119176864624, "logps/chosen": -0.5028018355369568, "logps/rejected": -0.5464227199554443, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.8308038711547852, "rewards/margins": 0.032810211181640625, "rewards/rejected": 0.7979936599731445, "step": 2785 }, { "epoch": 1.5, "learning_rate": 7.181076004726838e-08, "logits/chosen": -2.126551866531372, "logits/rejected": -2.14150071144104, "logps/chosen": -0.5388476848602295, "logps/rejected": -9.0975980758667, "loss": 0.4686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9592155814170837, "rewards/margins": 0.5145115852355957, "rewards/rejected": 0.44470396637916565, "step": 2786 }, { "epoch": 1.5, "learning_rate": 7.179110734762612e-08, "logits/chosen": -2.127173662185669, "logits/rejected": -2.1210014820098877, "logps/chosen": -2.8716440200805664, "logps/rejected": -2.415968179702759, "loss": 0.4359, "rewards/accuracies": 1.0, "rewards/chosen": 1.306176781654358, "rewards/margins": 0.604448139667511, "rewards/rejected": 0.7017286419868469, "step": 2787 }, { "epoch": 1.5, "learning_rate": 7.177145049118588e-08, "logits/chosen": -1.940340280532837, "logits/rejected": -2.2372961044311523, "logps/chosen": -0.4198269248008728, "logps/rejected": -0.4053262770175934, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.8430418372154236, "rewards/margins": 0.03526413440704346, "rewards/rejected": 0.8077777028083801, "step": 2788 }, { "epoch": 1.5, "learning_rate": 7.175178948169734e-08, "logits/chosen": -2.0588486194610596, "logits/rejected": -2.057117462158203, "logps/chosen": -0.5684335827827454, "logps/rejected": -4.842674732208252, "loss": 0.5499, "rewards/accuracies": 1.0, "rewards/chosen": 1.0198036432266235, "rewards/margins": 0.31058233976364136, "rewards/rejected": 0.7092213034629822, "step": 2789 }, { "epoch": 1.5, "learning_rate": 7.173212432291094e-08, "logits/chosen": -1.99867582321167, "logits/rejected": -2.000861644744873, "logps/chosen": -0.2574648857116699, "logps/rejected": -4.840301036834717, "loss": 0.5181, "rewards/accuracies": 1.0, "rewards/chosen": 0.8888967633247375, "rewards/margins": 0.3874002695083618, "rewards/rejected": 0.5014964938163757, "step": 2790 }, { "epoch": 1.51, "learning_rate": 7.171245501857798e-08, "logits/chosen": -2.0142769813537598, "logits/rejected": -2.003931760787964, "logps/chosen": -6.51649284362793, "logps/rejected": -3.8583974838256836, "loss": 0.5742, "rewards/accuracies": 1.0, "rewards/chosen": 1.1156386137008667, "rewards/margins": 0.25391459465026855, "rewards/rejected": 0.8617240190505981, "step": 2791 }, { "epoch": 1.51, "learning_rate": 7.169278157245051e-08, "logits/chosen": -2.0326168537139893, "logits/rejected": -2.295955181121826, "logps/chosen": -4.206288814544678, "logps/rejected": -7.24043607711792, "loss": 0.617, "rewards/accuracies": 1.0, "rewards/chosen": 1.1879100799560547, "rewards/margins": 0.15847885608673096, "rewards/rejected": 1.0294312238693237, "step": 2792 }, { "epoch": 1.51, "learning_rate": 7.167310398828135e-08, "logits/chosen": -1.9610710144042969, "logits/rejected": -1.963300347328186, "logps/chosen": -3.191181182861328, "logps/rejected": -6.775906562805176, "loss": 0.3861, "rewards/accuracies": 1.0, "rewards/chosen": 1.3715795278549194, "rewards/margins": 0.7524431943893433, "rewards/rejected": 0.6191363334655762, "step": 2793 }, { "epoch": 1.51, "learning_rate": 7.165342226982414e-08, "logits/chosen": -2.080037832260132, "logits/rejected": -2.079725980758667, "logps/chosen": -0.7403220534324646, "logps/rejected": -4.745219707489014, "loss": 0.5455, "rewards/accuracies": 1.0, "rewards/chosen": 0.8788246512413025, "rewards/margins": 0.3210393786430359, "rewards/rejected": 0.5577852725982666, "step": 2794 }, { "epoch": 1.51, "learning_rate": 7.163373642083328e-08, "logits/chosen": -2.0283730030059814, "logits/rejected": -2.2396297454833984, "logps/chosen": -1.8101707696914673, "logps/rejected": -1.9445127248764038, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.8111942410469055, "rewards/margins": 0.01331859827041626, "rewards/rejected": 0.7978756427764893, "step": 2795 }, { "epoch": 1.51, "learning_rate": 7.161404644506399e-08, "logits/chosen": -2.0940709114074707, "logits/rejected": -2.10052752494812, "logps/chosen": -2.0245814323425293, "logps/rejected": -3.4021432399749756, "loss": 0.4818, "rewards/accuracies": 1.0, "rewards/chosen": 1.136353850364685, "rewards/margins": 0.47955459356307983, "rewards/rejected": 0.6567992568016052, "step": 2796 }, { "epoch": 1.51, "learning_rate": 7.159435234627226e-08, "logits/chosen": -2.0390512943267822, "logits/rejected": -2.263362407684326, "logps/chosen": -1.370916724205017, "logps/rejected": -1.2714862823486328, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.9335780143737793, "rewards/margins": 0.011584758758544922, "rewards/rejected": 0.9219932556152344, "step": 2797 }, { "epoch": 1.51, "learning_rate": 7.157465412821488e-08, "logits/chosen": -2.0435173511505127, "logits/rejected": -2.279546022415161, "logps/chosen": -1.5563291311264038, "logps/rejected": -1.8354170322418213, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 1.0446547269821167, "rewards/margins": 0.017570018768310547, "rewards/rejected": 1.0270847082138062, "step": 2798 }, { "epoch": 1.51, "learning_rate": 7.155495179464941e-08, "logits/chosen": -2.054593563079834, "logits/rejected": -2.3082377910614014, "logps/chosen": -4.752318382263184, "logps/rejected": -4.564396858215332, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 1.117905616760254, "rewards/margins": -0.01439523696899414, "rewards/rejected": 1.132300853729248, "step": 2799 }, { "epoch": 1.51, "learning_rate": 7.153524534933422e-08, "logits/chosen": -2.1096127033233643, "logits/rejected": -2.3605520725250244, "logps/chosen": -0.9504857063293457, "logps/rejected": -25.27172088623047, "loss": 0.471, "rewards/accuracies": 1.0, "rewards/chosen": 1.8626171350479126, "rewards/margins": 0.508256196975708, "rewards/rejected": 1.3543609380722046, "step": 2800 }, { "epoch": 1.51, "learning_rate": 7.151553479602839e-08, "logits/chosen": -2.0879416465759277, "logits/rejected": -2.3409676551818848, "logps/chosen": -0.9813073873519897, "logps/rejected": -1.0717968940734863, "loss": 0.6718, "rewards/accuracies": 1.0, "rewards/chosen": 0.8890013694763184, "rewards/margins": 0.04310518503189087, "rewards/rejected": 0.8458961844444275, "step": 2801 }, { "epoch": 1.51, "learning_rate": 7.149582013849187e-08, "logits/chosen": -2.156153917312622, "logits/rejected": -2.298706293106079, "logps/chosen": -0.5232886075973511, "logps/rejected": -0.540729284286499, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.9864482879638672, "rewards/margins": 0.0009220242500305176, "rewards/rejected": 0.9855262637138367, "step": 2802 }, { "epoch": 1.51, "learning_rate": 7.147610138048538e-08, "logits/chosen": -2.2129244804382324, "logits/rejected": -2.148754358291626, "logps/chosen": -30.244556427001953, "logps/rejected": -5.821531295776367, "loss": 0.2551, "rewards/accuracies": 1.0, "rewards/chosen": 1.8256072998046875, "rewards/margins": 1.2359819412231445, "rewards/rejected": 0.5896252989768982, "step": 2803 }, { "epoch": 1.51, "learning_rate": 7.145637852577038e-08, "logits/chosen": -1.9797881841659546, "logits/rejected": -2.235365152359009, "logps/chosen": -2.846040964126587, "logps/rejected": -2.7091751098632812, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.962894082069397, "rewards/margins": 0.005314648151397705, "rewards/rejected": 0.9575794339179993, "step": 2804 }, { "epoch": 1.51, "learning_rate": 7.143665157810914e-08, "logits/chosen": -2.1634483337402344, "logits/rejected": -2.16021466255188, "logps/chosen": -3.7822113037109375, "logps/rejected": -4.003768444061279, "loss": 0.3106, "rewards/accuracies": 1.0, "rewards/chosen": 1.5141884088516235, "rewards/margins": 1.0098432302474976, "rewards/rejected": 0.504345178604126, "step": 2805 }, { "epoch": 1.51, "learning_rate": 7.14169205412647e-08, "logits/chosen": -2.121180295944214, "logits/rejected": -2.1160175800323486, "logps/chosen": -1.0374716520309448, "logps/rejected": -5.676374435424805, "loss": 0.5427, "rewards/accuracies": 1.0, "rewards/chosen": 1.0459402799606323, "rewards/margins": 0.32766443490982056, "rewards/rejected": 0.7182758450508118, "step": 2806 }, { "epoch": 1.51, "learning_rate": 7.139718541900091e-08, "logits/chosen": -2.0101428031921387, "logits/rejected": -2.236339569091797, "logps/chosen": -0.5433169603347778, "logps/rejected": -0.5502528548240662, "loss": 0.6663, "rewards/accuracies": 1.0, "rewards/chosen": 0.9975151419639587, "rewards/margins": 0.054448843002319336, "rewards/rejected": 0.9430662989616394, "step": 2807 }, { "epoch": 1.51, "learning_rate": 7.137744621508235e-08, "logits/chosen": -2.095926284790039, "logits/rejected": -2.2686333656311035, "logps/chosen": -1.0196439027786255, "logps/rejected": -6.454080581665039, "loss": 0.5524, "rewards/accuracies": 1.0, "rewards/chosen": 1.0255810022354126, "rewards/margins": 0.3045728802680969, "rewards/rejected": 0.7210081219673157, "step": 2808 }, { "epoch": 1.52, "learning_rate": 7.135770293327441e-08, "logits/chosen": -2.1835880279541016, "logits/rejected": -2.184201240539551, "logps/chosen": -0.6528744101524353, "logps/rejected": -2.25978684425354, "loss": 0.5402, "rewards/accuracies": 1.0, "rewards/chosen": 1.0772368907928467, "rewards/margins": 0.33369505405426025, "rewards/rejected": 0.7435418367385864, "step": 2809 }, { "epoch": 1.52, "learning_rate": 7.133795557734326e-08, "logits/chosen": -1.9833966493606567, "logits/rejected": -1.9630515575408936, "logps/chosen": -16.065155029296875, "logps/rejected": -2.171795129776001, "loss": 0.275, "rewards/accuracies": 1.0, "rewards/chosen": 1.8683913946151733, "rewards/margins": 1.1501824855804443, "rewards/rejected": 0.7182089686393738, "step": 2810 }, { "epoch": 1.52, "learning_rate": 7.131820415105583e-08, "logits/chosen": -2.037747383117676, "logits/rejected": -2.2367403507232666, "logps/chosen": -3.181079864501953, "logps/rejected": -3.011441469192505, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.8334976434707642, "rewards/margins": 0.023125767707824707, "rewards/rejected": 0.8103718757629395, "step": 2811 }, { "epoch": 1.52, "learning_rate": 7.129844865817985e-08, "logits/chosen": -2.10829758644104, "logits/rejected": -2.106921672821045, "logps/chosen": -0.8524263501167297, "logps/rejected": -1.7926816940307617, "loss": 0.6492, "rewards/accuracies": 1.0, "rewards/chosen": 0.9595858454704285, "rewards/margins": 0.08993840217590332, "rewards/rejected": 0.8696474432945251, "step": 2812 }, { "epoch": 1.52, "learning_rate": 7.127868910248378e-08, "logits/chosen": -2.0921266078948975, "logits/rejected": -2.082874298095703, "logps/chosen": -1.6406840085983276, "logps/rejected": -11.729104042053223, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": 1.4075740575790405, "rewards/margins": 0.9727638959884644, "rewards/rejected": 0.43481016159057617, "step": 2813 }, { "epoch": 1.52, "learning_rate": 7.12589254877369e-08, "logits/chosen": -2.011453866958618, "logits/rejected": -2.2101731300354004, "logps/chosen": -1.7420437335968018, "logps/rejected": -1.531911015510559, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 0.7241629958152771, "rewards/margins": -0.0020993947982788086, "rewards/rejected": 0.7262623906135559, "step": 2814 }, { "epoch": 1.52, "learning_rate": 7.12391578177093e-08, "logits/chosen": -2.138888120651245, "logits/rejected": -2.3211584091186523, "logps/chosen": -1.918654441833496, "logps/rejected": -7.491297721862793, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": 1.1914715766906738, "rewards/margins": 0.13238251209259033, "rewards/rejected": 1.0590890645980835, "step": 2815 }, { "epoch": 1.52, "learning_rate": 7.12193860961717e-08, "logits/chosen": -2.0788371562957764, "logits/rejected": -2.2757484912872314, "logps/chosen": -5.015922546386719, "logps/rejected": -4.591241836547852, "loss": 0.7032, "rewards/accuracies": 0.0, "rewards/chosen": 0.9914805293083191, "rewards/margins": -0.020024478435516357, "rewards/rejected": 1.0115050077438354, "step": 2816 }, { "epoch": 1.52, "learning_rate": 7.119961032689576e-08, "logits/chosen": -2.1273632049560547, "logits/rejected": -2.286323308944702, "logps/chosen": -0.626141369342804, "logps/rejected": -0.6789585947990417, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.7430877089500427, "rewards/margins": 0.02736639976501465, "rewards/rejected": 0.7157213091850281, "step": 2817 }, { "epoch": 1.52, "learning_rate": 7.117983051365381e-08, "logits/chosen": -2.0586392879486084, "logits/rejected": -2.354947328567505, "logps/chosen": -0.11489465832710266, "logps/rejected": -0.1198255717754364, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": 0.933834969997406, "rewards/margins": -0.009771227836608887, "rewards/rejected": 0.9436061978340149, "step": 2818 }, { "epoch": 1.52, "learning_rate": 7.1160046660219e-08, "logits/chosen": -2.0219063758850098, "logits/rejected": -2.2397518157958984, "logps/chosen": -8.796688079833984, "logps/rejected": -9.103404998779297, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 0.464806467294693, "rewards/margins": 0.039659589529037476, "rewards/rejected": 0.4251468777656555, "step": 2819 }, { "epoch": 1.52, "learning_rate": 7.11402587703652e-08, "logits/chosen": -2.037489891052246, "logits/rejected": -2.0399985313415527, "logps/chosen": -4.375156879425049, "logps/rejected": -3.22731351852417, "loss": 0.3277, "rewards/accuracies": 1.0, "rewards/chosen": 1.5944050550460815, "rewards/margins": 0.9473353624343872, "rewards/rejected": 0.6470696926116943, "step": 2820 }, { "epoch": 1.52, "learning_rate": 7.11204668478671e-08, "logits/chosen": -2.0664331912994385, "logits/rejected": -2.0551888942718506, "logps/chosen": -0.3267061710357666, "logps/rejected": -6.419426918029785, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 0.9263647198677063, "rewards/margins": 0.5306499600410461, "rewards/rejected": 0.39571475982666016, "step": 2821 }, { "epoch": 1.52, "learning_rate": 7.110067089650014e-08, "logits/chosen": -2.0434114933013916, "logits/rejected": -2.0403707027435303, "logps/chosen": -2.1882517337799072, "logps/rejected": -5.101448059082031, "loss": 0.4158, "rewards/accuracies": 1.0, "rewards/chosen": 1.2536996603012085, "rewards/margins": 0.662573516368866, "rewards/rejected": 0.5911261439323425, "step": 2822 }, { "epoch": 1.52, "learning_rate": 7.108087092004053e-08, "logits/chosen": -2.0743329524993896, "logits/rejected": -2.311246871948242, "logps/chosen": -7.367435455322266, "logps/rejected": -1.7791036367416382, "loss": 0.7966, "rewards/accuracies": 0.0, "rewards/chosen": 0.6474266052246094, "rewards/margins": -0.1971237063407898, "rewards/rejected": 0.8445503115653992, "step": 2823 }, { "epoch": 1.52, "learning_rate": 7.106106692226523e-08, "logits/chosen": -2.0886611938476562, "logits/rejected": -2.120159387588501, "logps/chosen": -3.314624309539795, "logps/rejected": -5.380687713623047, "loss": 0.3813, "rewards/accuracies": 1.0, "rewards/chosen": 1.6154526472091675, "rewards/margins": 0.7675999402999878, "rewards/rejected": 0.8478527069091797, "step": 2824 }, { "epoch": 1.52, "learning_rate": 7.104125890695202e-08, "logits/chosen": -2.115274429321289, "logits/rejected": -2.1156022548675537, "logps/chosen": -2.8338263034820557, "logps/rejected": -1.5480878353118896, "loss": 0.5408, "rewards/accuracies": 1.0, "rewards/chosen": 1.3195654153823853, "rewards/margins": 0.33214420080184937, "rewards/rejected": 0.9874212145805359, "step": 2825 }, { "epoch": 1.52, "learning_rate": 7.102144687787937e-08, "logits/chosen": -1.9659383296966553, "logits/rejected": -2.2557501792907715, "logps/chosen": -0.3582358956336975, "logps/rejected": -0.34799668192863464, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.8085402846336365, "rewards/margins": 0.006542682647705078, "rewards/rejected": 0.8019976019859314, "step": 2826 }, { "epoch": 1.52, "learning_rate": 7.100163083882657e-08, "logits/chosen": -2.059908866882324, "logits/rejected": -2.0584168434143066, "logps/chosen": -1.5539199113845825, "logps/rejected": -4.996923446655273, "loss": 0.4538, "rewards/accuracies": 1.0, "rewards/chosen": 1.0750997066497803, "rewards/margins": 0.5546873211860657, "rewards/rejected": 0.5204123854637146, "step": 2827 }, { "epoch": 1.53, "learning_rate": 7.098181079357369e-08, "logits/chosen": -1.9617136716842651, "logits/rejected": -1.968933343887329, "logps/chosen": -1.841538906097412, "logps/rejected": -2.9063313007354736, "loss": 0.4771, "rewards/accuracies": 1.0, "rewards/chosen": 1.1020851135253906, "rewards/margins": 0.4920275807380676, "rewards/rejected": 0.610057532787323, "step": 2828 }, { "epoch": 1.53, "learning_rate": 7.096198674590149e-08, "logits/chosen": -2.0870015621185303, "logits/rejected": -2.094217300415039, "logps/chosen": -4.96152400970459, "logps/rejected": -9.83828353881836, "loss": 0.4222, "rewards/accuracies": 1.0, "rewards/chosen": 1.1379674673080444, "rewards/margins": 0.6436866521835327, "rewards/rejected": 0.4942808151245117, "step": 2829 }, { "epoch": 1.53, "learning_rate": 7.094215869959154e-08, "logits/chosen": -2.1922786235809326, "logits/rejected": -2.3403472900390625, "logps/chosen": -16.062515258789062, "logps/rejected": -16.56858253479004, "loss": 0.6, "rewards/accuracies": 1.0, "rewards/chosen": 1.1732994318008423, "rewards/margins": 0.19589370489120483, "rewards/rejected": 0.9774057269096375, "step": 2830 }, { "epoch": 1.53, "learning_rate": 7.092232665842621e-08, "logits/chosen": -2.1241018772125244, "logits/rejected": -2.125844717025757, "logps/chosen": -0.19848129153251648, "logps/rejected": -4.335200309753418, "loss": 0.5407, "rewards/accuracies": 1.0, "rewards/chosen": 0.8429217338562012, "rewards/margins": 0.3323853611946106, "rewards/rejected": 0.5105363726615906, "step": 2831 }, { "epoch": 1.53, "learning_rate": 7.090249062618856e-08, "logits/chosen": -2.1484439373016357, "logits/rejected": -2.1482698917388916, "logps/chosen": -0.21548375487327576, "logps/rejected": -3.0322582721710205, "loss": 0.4928, "rewards/accuracies": 1.0, "rewards/chosen": 1.0237836837768555, "rewards/margins": 0.45120304822921753, "rewards/rejected": 0.5725806355476379, "step": 2832 }, { "epoch": 1.53, "learning_rate": 7.088265060666247e-08, "logits/chosen": -2.096369504928589, "logits/rejected": -2.088998794555664, "logps/chosen": -0.6531785726547241, "logps/rejected": -3.7888553142547607, "loss": 0.5808, "rewards/accuracies": 1.0, "rewards/chosen": 0.937837541103363, "rewards/margins": 0.23888427019119263, "rewards/rejected": 0.6989532709121704, "step": 2833 }, { "epoch": 1.53, "learning_rate": 7.086280660363253e-08, "logits/chosen": -2.1537702083587646, "logits/rejected": -2.1209635734558105, "logps/chosen": -14.986920356750488, "logps/rejected": -6.64286994934082, "loss": 0.3722, "rewards/accuracies": 1.0, "rewards/chosen": 1.5956534147262573, "rewards/margins": 0.7964826226234436, "rewards/rejected": 0.7991707921028137, "step": 2834 }, { "epoch": 1.53, "learning_rate": 7.084295862088411e-08, "logits/chosen": -1.9701530933380127, "logits/rejected": -2.2674317359924316, "logps/chosen": -10.044337272644043, "logps/rejected": -9.986435890197754, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.34743985533714294, "rewards/margins": -0.004312723875045776, "rewards/rejected": 0.3517525792121887, "step": 2835 }, { "epoch": 1.53, "learning_rate": 7.082310666220337e-08, "logits/chosen": -1.9462969303131104, "logits/rejected": -2.238163948059082, "logps/chosen": -1.0742155313491821, "logps/rejected": -1.097927451133728, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.8307774662971497, "rewards/margins": 0.029222607612609863, "rewards/rejected": 0.8015548586845398, "step": 2836 }, { "epoch": 1.53, "learning_rate": 7.08032507313772e-08, "logits/chosen": -2.00323486328125, "logits/rejected": -2.25089430809021, "logps/chosen": -0.3880525231361389, "logps/rejected": -0.43564221262931824, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 1.0141395330429077, "rewards/margins": 0.003486156463623047, "rewards/rejected": 1.0106533765792847, "step": 2837 }, { "epoch": 1.53, "learning_rate": 7.078339083219325e-08, "logits/chosen": -2.0534512996673584, "logits/rejected": -2.117245674133301, "logps/chosen": -9.883468627929688, "logps/rejected": -7.456804275512695, "loss": 0.6018, "rewards/accuracies": 1.0, "rewards/chosen": 1.2158085107803345, "rewards/margins": 0.19182837009429932, "rewards/rejected": 1.0239801406860352, "step": 2838 }, { "epoch": 1.53, "learning_rate": 7.07635269684399e-08, "logits/chosen": -2.1486449241638184, "logits/rejected": -2.3281900882720947, "logps/chosen": -10.122105598449707, "logps/rejected": -9.944038391113281, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.8231345415115356, "rewards/margins": -0.012362182140350342, "rewards/rejected": 0.835496723651886, "step": 2839 }, { "epoch": 1.53, "learning_rate": 7.074365914390634e-08, "logits/chosen": -1.9651846885681152, "logits/rejected": -2.2888357639312744, "logps/chosen": -0.6883423924446106, "logps/rejected": -0.752872884273529, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.776317298412323, "rewards/margins": 0.0040926337242126465, "rewards/rejected": 0.7722246646881104, "step": 2840 }, { "epoch": 1.53, "learning_rate": 7.072378736238247e-08, "logits/chosen": -2.1306967735290527, "logits/rejected": -2.1358540058135986, "logps/chosen": -1.5963821411132812, "logps/rejected": -3.6595914363861084, "loss": 0.4772, "rewards/accuracies": 1.0, "rewards/chosen": 0.9547669291496277, "rewards/margins": 0.49184396862983704, "rewards/rejected": 0.46292296051979065, "step": 2841 }, { "epoch": 1.53, "learning_rate": 7.0703911627659e-08, "logits/chosen": -2.1099913120269775, "logits/rejected": -2.249680280685425, "logps/chosen": -0.33573710918426514, "logps/rejected": -0.3254568576812744, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.8505503535270691, "rewards/margins": 0.02749556303024292, "rewards/rejected": 0.8230547904968262, "step": 2842 }, { "epoch": 1.53, "learning_rate": 7.068403194352732e-08, "logits/chosen": -2.054886817932129, "logits/rejected": -2.0668532848358154, "logps/chosen": -4.339860916137695, "logps/rejected": -10.713865280151367, "loss": 0.2595, "rewards/accuracies": 1.0, "rewards/chosen": 1.7165197134017944, "rewards/margins": 1.2164063453674316, "rewards/rejected": 0.500113308429718, "step": 2843 }, { "epoch": 1.53, "learning_rate": 7.066414831377962e-08, "logits/chosen": -2.200000286102295, "logits/rejected": -2.281743288040161, "logps/chosen": -0.9055168628692627, "logps/rejected": -1.8894931077957153, "loss": 0.6683, "rewards/accuracies": 1.0, "rewards/chosen": 1.0302330255508423, "rewards/margins": 0.050278306007385254, "rewards/rejected": 0.979954719543457, "step": 2844 }, { "epoch": 1.53, "learning_rate": 7.064426074220885e-08, "logits/chosen": -2.0839548110961914, "logits/rejected": -2.094378709793091, "logps/chosen": -3.5659167766571045, "logps/rejected": -3.233358144760132, "loss": 0.3092, "rewards/accuracies": 1.0, "rewards/chosen": 1.4994351863861084, "rewards/margins": 1.0151362419128418, "rewards/rejected": 0.4842989146709442, "step": 2845 }, { "epoch": 1.54, "learning_rate": 7.062436923260868e-08, "logits/chosen": -2.1835098266601562, "logits/rejected": -2.3090314865112305, "logps/chosen": -0.3349933624267578, "logps/rejected": -0.3525639772415161, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.8313905596733093, "rewards/margins": 0.005156040191650391, "rewards/rejected": 0.8262345194816589, "step": 2846 }, { "epoch": 1.54, "learning_rate": 7.060447378877354e-08, "logits/chosen": -2.1482093334198, "logits/rejected": -2.1451690196990967, "logps/chosen": -3.600513458251953, "logps/rejected": -7.260907173156738, "loss": 0.3907, "rewards/accuracies": 1.0, "rewards/chosen": 1.4773839712142944, "rewards/margins": 0.7381840348243713, "rewards/rejected": 0.7391999363899231, "step": 2847 }, { "epoch": 1.54, "learning_rate": 7.058457441449865e-08, "logits/chosen": -2.0569238662719727, "logits/rejected": -2.25730037689209, "logps/chosen": -1.4265347719192505, "logps/rejected": -6.513782024383545, "loss": 0.6163, "rewards/accuracies": 1.0, "rewards/chosen": 0.8447472453117371, "rewards/margins": 0.1600092053413391, "rewards/rejected": 0.684738039970398, "step": 2848 }, { "epoch": 1.54, "learning_rate": 7.056467111357993e-08, "logits/chosen": -2.0766079425811768, "logits/rejected": -2.081557035446167, "logps/chosen": -1.6893372535705566, "logps/rejected": -2.5458590984344482, "loss": 0.4741, "rewards/accuracies": 1.0, "rewards/chosen": 1.1615982055664062, "rewards/margins": 0.5000519752502441, "rewards/rejected": 0.6615462303161621, "step": 2849 }, { "epoch": 1.54, "learning_rate": 7.054476388981404e-08, "logits/chosen": -2.075355052947998, "logits/rejected": -2.2596232891082764, "logps/chosen": -5.820143699645996, "logps/rejected": -0.4169209599494934, "loss": 0.8277, "rewards/accuracies": 0.0, "rewards/chosen": 0.6157437562942505, "rewards/margins": -0.253200888633728, "rewards/rejected": 0.8689446449279785, "step": 2850 }, { "epoch": 1.54, "learning_rate": 7.052485274699846e-08, "logits/chosen": -2.055222272872925, "logits/rejected": -2.244844913482666, "logps/chosen": -1.6556198596954346, "logps/rejected": -1.6407005786895752, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.7314788699150085, "rewards/margins": 0.029199600219726562, "rewards/rejected": 0.702279269695282, "step": 2851 }, { "epoch": 1.54, "learning_rate": 7.050493768893133e-08, "logits/chosen": -1.9756056070327759, "logits/rejected": -1.9724974632263184, "logps/chosen": -2.888026714324951, "logps/rejected": -3.969810962677002, "loss": 0.3698, "rewards/accuracies": 1.0, "rewards/chosen": 1.372117280960083, "rewards/margins": 0.8041375875473022, "rewards/rejected": 0.5679796934127808, "step": 2852 }, { "epoch": 1.54, "learning_rate": 7.048501871941162e-08, "logits/chosen": -2.1803767681121826, "logits/rejected": -2.2687647342681885, "logps/chosen": -4.192117214202881, "logps/rejected": -6.394222259521484, "loss": 0.6524, "rewards/accuracies": 1.0, "rewards/chosen": 0.9754894375801086, "rewards/margins": 0.08316099643707275, "rewards/rejected": 0.8923284411430359, "step": 2853 }, { "epoch": 1.54, "learning_rate": 7.046509584223897e-08, "logits/chosen": -1.9959150552749634, "logits/rejected": -2.2907822132110596, "logps/chosen": -1.096447229385376, "logps/rejected": -0.9678608179092407, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.8331508636474609, "rewards/margins": 0.012597739696502686, "rewards/rejected": 0.8205531239509583, "step": 2854 }, { "epoch": 1.54, "learning_rate": 7.044516906121382e-08, "logits/chosen": -2.089960813522339, "logits/rejected": -2.296417236328125, "logps/chosen": -1.024022102355957, "logps/rejected": -1.1598923206329346, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 1.0843003988265991, "rewards/margins": 0.017435073852539062, "rewards/rejected": 1.06686532497406, "step": 2855 }, { "epoch": 1.54, "learning_rate": 7.042523838013734e-08, "logits/chosen": -2.117375373840332, "logits/rejected": -2.26840877532959, "logps/chosen": -0.4675932228565216, "logps/rejected": -0.5287026762962341, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.8282252550125122, "rewards/margins": -0.004113256931304932, "rewards/rejected": 0.8323385119438171, "step": 2856 }, { "epoch": 1.54, "learning_rate": 7.040530380281142e-08, "logits/chosen": -2.1000940799713135, "logits/rejected": -2.29551362991333, "logps/chosen": -1.6022326946258545, "logps/rejected": -1.2570065259933472, "loss": 0.7198, "rewards/accuracies": 0.0, "rewards/chosen": 0.8990155458450317, "rewards/margins": -0.052699387073516846, "rewards/rejected": 0.9517149329185486, "step": 2857 }, { "epoch": 1.54, "learning_rate": 7.038536533303871e-08, "logits/chosen": -2.1581716537475586, "logits/rejected": -2.265246629714966, "logps/chosen": -9.422837257385254, "logps/rejected": -8.980375289916992, "loss": 0.7025, "rewards/accuracies": 0.0, "rewards/chosen": 0.46590375900268555, "rewards/margins": -0.018542587757110596, "rewards/rejected": 0.48444634675979614, "step": 2858 }, { "epoch": 1.54, "learning_rate": 7.036542297462264e-08, "logits/chosen": -2.2004339694976807, "logits/rejected": -2.2016103267669678, "logps/chosen": -0.9284781217575073, "logps/rejected": -3.142033815383911, "loss": 0.4607, "rewards/accuracies": 1.0, "rewards/chosen": 1.3028284311294556, "rewards/margins": 0.5357422828674316, "rewards/rejected": 0.7670861482620239, "step": 2859 }, { "epoch": 1.54, "learning_rate": 7.034547673136732e-08, "logits/chosen": -2.0209567546844482, "logits/rejected": -2.274780035018921, "logps/chosen": -0.26329636573791504, "logps/rejected": -0.3194371163845062, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 1.0432811975479126, "rewards/margins": 0.026774048805236816, "rewards/rejected": 1.0165071487426758, "step": 2860 }, { "epoch": 1.54, "learning_rate": 7.032552660707765e-08, "logits/chosen": -2.2158069610595703, "logits/rejected": -2.0945963859558105, "logps/chosen": -46.51589584350586, "logps/rejected": -9.51334285736084, "loss": 0.277, "rewards/accuracies": 1.0, "rewards/chosen": 2.141991138458252, "rewards/margins": 1.1418431997299194, "rewards/rejected": 1.0001479387283325, "step": 2861 }, { "epoch": 1.54, "learning_rate": 7.030557260555921e-08, "logits/chosen": -2.0044913291931152, "logits/rejected": -2.2635629177093506, "logps/chosen": -0.14635597169399261, "logps/rejected": -0.14648768305778503, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.9315848350524902, "rewards/margins": -0.0005559325218200684, "rewards/rejected": 0.9321407675743103, "step": 2862 }, { "epoch": 1.54, "learning_rate": 7.028561473061835e-08, "logits/chosen": -2.056739330291748, "logits/rejected": -2.3215863704681396, "logps/chosen": -9.359366416931152, "logps/rejected": -8.861757278442383, "loss": 0.7118, "rewards/accuracies": 0.0, "rewards/chosen": 0.4564511477947235, "rewards/margins": -0.03695821762084961, "rewards/rejected": 0.4934093654155731, "step": 2863 }, { "epoch": 1.54, "learning_rate": 7.026565298606225e-08, "logits/chosen": -2.0263476371765137, "logits/rejected": -2.292376756668091, "logps/chosen": -0.37342244386672974, "logps/rejected": -0.35240697860717773, "loss": 0.6641, "rewards/accuracies": 1.0, "rewards/chosen": 0.9902334213256836, "rewards/margins": 0.05889624357223511, "rewards/rejected": 0.9313371777534485, "step": 2864 }, { "epoch": 1.55, "learning_rate": 7.024568737569865e-08, "logits/chosen": -2.1215572357177734, "logits/rejected": -2.224886655807495, "logps/chosen": -6.411656379699707, "logps/rejected": -0.5717433094978333, "loss": 0.6146, "rewards/accuracies": 1.0, "rewards/chosen": 1.0266648530960083, "rewards/margins": 0.16372603178024292, "rewards/rejected": 0.8629388213157654, "step": 2865 }, { "epoch": 1.55, "learning_rate": 7.02257179033362e-08, "logits/chosen": -2.0497806072235107, "logits/rejected": -2.326047658920288, "logps/chosen": -2.539430618286133, "logps/rejected": -2.664457321166992, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.7314175963401794, "rewards/margins": 0.0187719464302063, "rewards/rejected": 0.7126456499099731, "step": 2866 }, { "epoch": 1.55, "learning_rate": 7.020574457278414e-08, "logits/chosen": -1.9942554235458374, "logits/rejected": -1.998688817024231, "logps/chosen": -2.006068468093872, "logps/rejected": -3.1570076942443848, "loss": 0.4495, "rewards/accuracies": 1.0, "rewards/chosen": 1.1558021306991577, "rewards/margins": 0.5663298964500427, "rewards/rejected": 0.589472234249115, "step": 2867 }, { "epoch": 1.55, "learning_rate": 7.018576738785257e-08, "logits/chosen": -2.0650365352630615, "logits/rejected": -2.059685230255127, "logps/chosen": -2.7798070907592773, "logps/rejected": -5.779579162597656, "loss": 0.4602, "rewards/accuracies": 1.0, "rewards/chosen": 1.2687766551971436, "rewards/margins": 0.5371308922767639, "rewards/rejected": 0.7316457629203796, "step": 2868 }, { "epoch": 1.55, "learning_rate": 7.016578635235223e-08, "logits/chosen": -2.078130006790161, "logits/rejected": -2.082742929458618, "logps/chosen": -2.51920485496521, "logps/rejected": -3.9769086837768555, "loss": 0.518, "rewards/accuracies": 1.0, "rewards/chosen": 1.0386691093444824, "rewards/margins": 0.38768470287323, "rewards/rejected": 0.6509844064712524, "step": 2869 }, { "epoch": 1.55, "learning_rate": 7.014580147009467e-08, "logits/chosen": -1.9952014684677124, "logits/rejected": -2.2591912746429443, "logps/chosen": -0.2661120295524597, "logps/rejected": -0.2895064353942871, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.8616747856140137, "rewards/margins": -0.006036281585693359, "rewards/rejected": 0.867711067199707, "step": 2870 }, { "epoch": 1.55, "learning_rate": 7.012581274489211e-08, "logits/chosen": -2.183488368988037, "logits/rejected": -2.2817435264587402, "logps/chosen": -4.6864213943481445, "logps/rejected": -0.36556190252304077, "loss": 0.7737, "rewards/accuracies": 0.0, "rewards/chosen": 0.8843697905540466, "rewards/margins": -0.15504509210586548, "rewards/rejected": 1.039414882659912, "step": 2871 }, { "epoch": 1.55, "learning_rate": 7.010582018055754e-08, "logits/chosen": -2.0573925971984863, "logits/rejected": -2.0578322410583496, "logps/chosen": -3.1529123783111572, "logps/rejected": -4.0843658447265625, "loss": 0.3332, "rewards/accuracies": 1.0, "rewards/chosen": 1.515501856803894, "rewards/margins": 0.9277282357215881, "rewards/rejected": 0.5877736210823059, "step": 2872 }, { "epoch": 1.55, "learning_rate": 7.008582378090468e-08, "logits/chosen": -2.076505184173584, "logits/rejected": -2.25577974319458, "logps/chosen": -0.3744613230228424, "logps/rejected": -0.4046038091182709, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.922823429107666, "rewards/margins": 0.024887442588806152, "rewards/rejected": 0.8979359865188599, "step": 2873 }, { "epoch": 1.55, "learning_rate": 7.006582354974797e-08, "logits/chosen": -2.052506923675537, "logits/rejected": -2.3181955814361572, "logps/chosen": -1.4904870986938477, "logps/rejected": -5.570460319519043, "loss": 0.6134, "rewards/accuracies": 1.0, "rewards/chosen": 1.0500891208648682, "rewards/margins": 0.16646254062652588, "rewards/rejected": 0.8836265802383423, "step": 2874 }, { "epoch": 1.55, "learning_rate": 7.004581949090259e-08, "logits/chosen": -2.107166051864624, "logits/rejected": -2.228614568710327, "logps/chosen": -2.1996798515319824, "logps/rejected": -0.7637534737586975, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.8655371069908142, "rewards/margins": 0.028699159622192383, "rewards/rejected": 0.8368379473686218, "step": 2875 }, { "epoch": 1.55, "learning_rate": 7.002581160818445e-08, "logits/chosen": -2.1423208713531494, "logits/rejected": -2.125107765197754, "logps/chosen": -12.229129791259766, "logps/rejected": -7.187819480895996, "loss": 0.2924, "rewards/accuracies": 1.0, "rewards/chosen": 1.3533117771148682, "rewards/margins": 1.07987380027771, "rewards/rejected": 0.2734379768371582, "step": 2876 }, { "epoch": 1.55, "learning_rate": 7.000579990541016e-08, "logits/chosen": -2.2064383029937744, "logits/rejected": -2.1729888916015625, "logps/chosen": -13.966072082519531, "logps/rejected": -2.9378397464752197, "loss": 0.5135, "rewards/accuracies": 1.0, "rewards/chosen": 1.1463993787765503, "rewards/margins": 0.3987932801246643, "rewards/rejected": 0.747606098651886, "step": 2877 }, { "epoch": 1.55, "learning_rate": 6.998578438639713e-08, "logits/chosen": -2.063807964324951, "logits/rejected": -2.3379743099212646, "logps/chosen": -0.27493855357170105, "logps/rejected": -0.3135010898113251, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.7981823086738586, "rewards/margins": 0.0018268823623657227, "rewards/rejected": 0.7963554263114929, "step": 2878 }, { "epoch": 1.55, "learning_rate": 6.996576505496343e-08, "logits/chosen": -1.9942883253097534, "logits/rejected": -2.2294790744781494, "logps/chosen": -0.5898745656013489, "logps/rejected": -0.6480022072792053, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.7969330549240112, "rewards/margins": 0.011157810688018799, "rewards/rejected": 0.7857752442359924, "step": 2879 }, { "epoch": 1.55, "learning_rate": 6.994574191492786e-08, "logits/chosen": -2.1288723945617676, "logits/rejected": -2.125880002975464, "logps/chosen": -4.580625534057617, "logps/rejected": -4.773665428161621, "loss": 0.3457, "rewards/accuracies": 1.0, "rewards/chosen": 1.276889681816101, "rewards/margins": 0.884368896484375, "rewards/rejected": 0.39252081513404846, "step": 2880 }, { "epoch": 1.55, "learning_rate": 6.992571497010999e-08, "logits/chosen": -1.963100552558899, "logits/rejected": -2.2402737140655518, "logps/chosen": -0.7622584700584412, "logps/rejected": -0.7624393105506897, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.8314377069473267, "rewards/margins": 0.02140408754348755, "rewards/rejected": 0.8100336194038391, "step": 2881 }, { "epoch": 1.55, "learning_rate": 6.99056842243301e-08, "logits/chosen": -2.079549551010132, "logits/rejected": -2.214151382446289, "logps/chosen": -2.7405285835266113, "logps/rejected": -2.347620964050293, "loss": 0.7092, "rewards/accuracies": 0.0, "rewards/chosen": 0.8450388312339783, "rewards/margins": -0.031840980052948, "rewards/rejected": 0.8768798112869263, "step": 2882 }, { "epoch": 1.56, "learning_rate": 6.988564968140916e-08, "logits/chosen": -2.2147939205169678, "logits/rejected": -2.3187127113342285, "logps/chosen": -1.6116764545440674, "logps/rejected": -1.5656890869140625, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 1.0301692485809326, "rewards/margins": 0.018058180809020996, "rewards/rejected": 1.0121110677719116, "step": 2883 }, { "epoch": 1.56, "learning_rate": 6.986561134516892e-08, "logits/chosen": -1.9899487495422363, "logits/rejected": -2.295478105545044, "logps/chosen": -0.3415667712688446, "logps/rejected": -0.4114437699317932, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.9669513702392578, "rewards/margins": 0.002048492431640625, "rewards/rejected": 0.9649028778076172, "step": 2884 }, { "epoch": 1.56, "learning_rate": 6.984556921943181e-08, "logits/chosen": -1.9406887292861938, "logits/rejected": -1.9481054544448853, "logps/chosen": -1.7023205757141113, "logps/rejected": -3.4495646953582764, "loss": 0.495, "rewards/accuracies": 1.0, "rewards/chosen": 0.9102529883384705, "rewards/margins": 0.44561609625816345, "rewards/rejected": 0.464636892080307, "step": 2885 }, { "epoch": 1.56, "learning_rate": 6.982552330802099e-08, "logits/chosen": -1.9612510204315186, "logits/rejected": -2.311884641647339, "logps/chosen": -4.329025745391846, "logps/rejected": -4.537320613861084, "loss": 0.6664, "rewards/accuracies": 1.0, "rewards/chosen": 0.8146724700927734, "rewards/margins": 0.054279446601867676, "rewards/rejected": 0.7603930234909058, "step": 2886 }, { "epoch": 1.56, "learning_rate": 6.980547361476037e-08, "logits/chosen": -2.0062599182128906, "logits/rejected": -2.2577998638153076, "logps/chosen": -0.26318079233169556, "logps/rejected": -0.2960696816444397, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.966667652130127, "rewards/margins": 0.01476210355758667, "rewards/rejected": 0.9519055485725403, "step": 2887 }, { "epoch": 1.56, "learning_rate": 6.978542014347454e-08, "logits/chosen": -2.0174601078033447, "logits/rejected": -2.007855176925659, "logps/chosen": -12.152837753295898, "logps/rejected": -2.6580185890197754, "loss": 0.6304, "rewards/accuracies": 1.0, "rewards/chosen": 0.9781059622764587, "rewards/margins": 0.12976360321044922, "rewards/rejected": 0.8483423590660095, "step": 2888 }, { "epoch": 1.56, "learning_rate": 6.976536289798885e-08, "logits/chosen": -2.210397243499756, "logits/rejected": -2.2713987827301025, "logps/chosen": -1.0763803720474243, "logps/rejected": -1.21023428440094, "loss": 0.6734, "rewards/accuracies": 1.0, "rewards/chosen": 0.9166522026062012, "rewards/margins": 0.03989964723587036, "rewards/rejected": 0.8767525553703308, "step": 2889 }, { "epoch": 1.56, "learning_rate": 6.974530188212935e-08, "logits/chosen": -2.0825462341308594, "logits/rejected": -2.0858805179595947, "logps/chosen": -0.3986200988292694, "logps/rejected": -6.1494293212890625, "loss": 0.513, "rewards/accuracies": 1.0, "rewards/chosen": 0.9415984153747559, "rewards/margins": 0.4000254273414612, "rewards/rejected": 0.5415729880332947, "step": 2890 }, { "epoch": 1.56, "learning_rate": 6.972523709972281e-08, "logits/chosen": -1.9989897012710571, "logits/rejected": -2.3159265518188477, "logps/chosen": -4.8149733543396, "logps/rejected": -4.591450214385986, "loss": 0.711, "rewards/accuracies": 0.0, "rewards/chosen": 0.6921482682228088, "rewards/margins": -0.03542196750640869, "rewards/rejected": 0.7275702357292175, "step": 2891 }, { "epoch": 1.56, "learning_rate": 6.970516855459672e-08, "logits/chosen": -2.0504441261291504, "logits/rejected": -2.3016433715820312, "logps/chosen": -1.2941467761993408, "logps/rejected": -3.226874828338623, "loss": 0.6022, "rewards/accuracies": 1.0, "rewards/chosen": 1.1091976165771484, "rewards/margins": 0.19094562530517578, "rewards/rejected": 0.9182519912719727, "step": 2892 }, { "epoch": 1.56, "learning_rate": 6.968509625057927e-08, "logits/chosen": -1.9890438318252563, "logits/rejected": -1.9858521223068237, "logps/chosen": -3.324418067932129, "logps/rejected": -2.360349178314209, "loss": 0.3924, "rewards/accuracies": 1.0, "rewards/chosen": 1.4343701601028442, "rewards/margins": 0.7327511310577393, "rewards/rejected": 0.701619029045105, "step": 2893 }, { "epoch": 1.56, "learning_rate": 6.966502019149942e-08, "logits/chosen": -2.0506789684295654, "logits/rejected": -2.0567736625671387, "logps/chosen": -1.2720223665237427, "logps/rejected": -3.9894304275512695, "loss": 0.475, "rewards/accuracies": 1.0, "rewards/chosen": 0.848139226436615, "rewards/margins": 0.49743345379829407, "rewards/rejected": 0.3507057726383209, "step": 2894 }, { "epoch": 1.56, "learning_rate": 6.964494038118679e-08, "logits/chosen": -2.1342499256134033, "logits/rejected": -2.135197639465332, "logps/chosen": -2.453350782394409, "logps/rejected": -2.714482307434082, "loss": 0.6089, "rewards/accuracies": 1.0, "rewards/chosen": 0.8581892848014832, "rewards/margins": 0.1761646270751953, "rewards/rejected": 0.6820246577262878, "step": 2895 }, { "epoch": 1.56, "learning_rate": 6.962485682347173e-08, "logits/chosen": -2.0912158489227295, "logits/rejected": -2.2538130283355713, "logps/chosen": -1.2761824131011963, "logps/rejected": -1.4516676664352417, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.6994543075561523, "rewards/margins": 0.039041221141815186, "rewards/rejected": 0.6604130864143372, "step": 2896 }, { "epoch": 1.56, "learning_rate": 6.960476952218532e-08, "logits/chosen": -2.0904438495635986, "logits/rejected": -1.976423978805542, "logps/chosen": -27.140207290649414, "logps/rejected": -4.245687961578369, "loss": 0.517, "rewards/accuracies": 1.0, "rewards/chosen": 1.3857663869857788, "rewards/margins": 0.390167772769928, "rewards/rejected": 0.9955986142158508, "step": 2897 }, { "epoch": 1.56, "learning_rate": 6.958467848115935e-08, "logits/chosen": -2.0957934856414795, "logits/rejected": -2.332383632659912, "logps/chosen": -0.9888738393783569, "logps/rejected": -0.9121307730674744, "loss": 0.6305, "rewards/accuracies": 1.0, "rewards/chosen": 1.2480639219284058, "rewards/margins": 0.12951719760894775, "rewards/rejected": 1.118546724319458, "step": 2898 }, { "epoch": 1.56, "learning_rate": 6.956458370422633e-08, "logits/chosen": -2.1296424865722656, "logits/rejected": -2.158735513687134, "logps/chosen": -8.677266120910645, "logps/rejected": -17.71946144104004, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 1.2472342252731323, "rewards/margins": 0.8110296726226807, "rewards/rejected": 0.4362045228481293, "step": 2899 }, { "epoch": 1.56, "learning_rate": 6.954448519521945e-08, "logits/chosen": -2.0271153450012207, "logits/rejected": -1.973610758781433, "logps/chosen": -31.86488914489746, "logps/rejected": -2.6288368701934814, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 1.9907457828521729, "rewards/margins": 1.3743808269500732, "rewards/rejected": 0.6163648962974548, "step": 2900 }, { "epoch": 1.56, "learning_rate": 6.952438295797263e-08, "logits/chosen": -2.0075883865356445, "logits/rejected": -2.2776052951812744, "logps/chosen": -0.4275808334350586, "logps/rejected": -0.466031551361084, "loss": 0.6711, "rewards/accuracies": 1.0, "rewards/chosen": 1.0624470710754395, "rewards/margins": 0.04459333419799805, "rewards/rejected": 1.0178537368774414, "step": 2901 }, { "epoch": 1.57, "learning_rate": 6.950427699632053e-08, "logits/chosen": -2.0086798667907715, "logits/rejected": -2.0089735984802246, "logps/chosen": -1.8010305166244507, "logps/rejected": -1.933149814605713, "loss": 0.5603, "rewards/accuracies": 1.0, "rewards/chosen": 1.2488328218460083, "rewards/margins": 0.2860097289085388, "rewards/rejected": 0.9628230929374695, "step": 2902 }, { "epoch": 1.57, "learning_rate": 6.948416731409845e-08, "logits/chosen": -2.1738474369049072, "logits/rejected": -2.42490291595459, "logps/chosen": -10.94758129119873, "logps/rejected": -12.946616172790527, "loss": 0.8491, "rewards/accuracies": 0.0, "rewards/chosen": 0.9150969386100769, "rewards/margins": -0.2908638119697571, "rewards/rejected": 1.205960750579834, "step": 2903 }, { "epoch": 1.57, "learning_rate": 6.94640539151425e-08, "logits/chosen": -2.0334856510162354, "logits/rejected": -2.2342755794525146, "logps/chosen": -2.8345956802368164, "logps/rejected": -3.9972994327545166, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.98051518201828, "rewards/margins": -0.000886380672454834, "rewards/rejected": 0.9814015626907349, "step": 2904 }, { "epoch": 1.57, "learning_rate": 6.944393680328942e-08, "logits/chosen": -2.1224005222320557, "logits/rejected": -2.0675930976867676, "logps/chosen": -17.34695053100586, "logps/rejected": -3.265328884124756, "loss": 0.2984, "rewards/accuracies": 1.0, "rewards/chosen": 1.689888834953308, "rewards/margins": 1.056276559829712, "rewards/rejected": 0.633612334728241, "step": 2905 }, { "epoch": 1.57, "learning_rate": 6.942381598237668e-08, "logits/chosen": -1.9971981048583984, "logits/rejected": -2.0118277072906494, "logps/chosen": -3.2427732944488525, "logps/rejected": -7.126510143280029, "loss": 0.5233, "rewards/accuracies": 1.0, "rewards/chosen": 0.9825431704521179, "rewards/margins": 0.3746718764305115, "rewards/rejected": 0.6078712940216064, "step": 2906 }, { "epoch": 1.57, "learning_rate": 6.940369145624245e-08, "logits/chosen": -2.054047107696533, "logits/rejected": -2.3176991939544678, "logps/chosen": -1.8004416227340698, "logps/rejected": -1.9495245218276978, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": 1.057962417602539, "rewards/margins": -0.012441039085388184, "rewards/rejected": 1.0704034566879272, "step": 2907 }, { "epoch": 1.57, "learning_rate": 6.938356322872564e-08, "logits/chosen": -2.128948211669922, "logits/rejected": -2.301692008972168, "logps/chosen": -1.2433314323425293, "logps/rejected": -0.9971371293067932, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 1.0625574588775635, "rewards/margins": 0.0387120246887207, "rewards/rejected": 1.0238454341888428, "step": 2908 }, { "epoch": 1.57, "learning_rate": 6.936343130366584e-08, "logits/chosen": -2.191683769226074, "logits/rejected": -2.1868996620178223, "logps/chosen": -6.395571708679199, "logps/rejected": -5.835819721221924, "loss": 0.3108, "rewards/accuracies": 1.0, "rewards/chosen": 1.3538514375686646, "rewards/margins": 1.0092657804489136, "rewards/rejected": 0.344585657119751, "step": 2909 }, { "epoch": 1.57, "learning_rate": 6.934329568490332e-08, "logits/chosen": -2.1701576709747314, "logits/rejected": -2.07875657081604, "logps/chosen": -20.66229248046875, "logps/rejected": -3.3568999767303467, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 2.330622911453247, "rewards/margins": 1.8060742616653442, "rewards/rejected": 0.5245486497879028, "step": 2910 }, { "epoch": 1.57, "learning_rate": 6.932315637627912e-08, "logits/chosen": -1.9619908332824707, "logits/rejected": -1.9610713720321655, "logps/chosen": -1.2169830799102783, "logps/rejected": -2.6163578033447266, "loss": 0.5713, "rewards/accuracies": 1.0, "rewards/chosen": 1.1279454231262207, "rewards/margins": 0.2606678605079651, "rewards/rejected": 0.8672775626182556, "step": 2911 }, { "epoch": 1.57, "learning_rate": 6.93030133816349e-08, "logits/chosen": -2.0951173305511475, "logits/rejected": -2.0476298332214355, "logps/chosen": -9.36288070678711, "logps/rejected": -5.898598670959473, "loss": 0.4073, "rewards/accuracies": 1.0, "rewards/chosen": 1.4995813369750977, "rewards/margins": 0.6877813935279846, "rewards/rejected": 0.811799943447113, "step": 2912 }, { "epoch": 1.57, "learning_rate": 6.928286670481313e-08, "logits/chosen": -2.0834219455718994, "logits/rejected": -2.082991600036621, "logps/chosen": -1.2817730903625488, "logps/rejected": -1.4225512742996216, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 0.8418886065483093, "rewards/margins": 0.05129200220108032, "rewards/rejected": 0.790596604347229, "step": 2913 }, { "epoch": 1.57, "learning_rate": 6.926271634965689e-08, "logits/chosen": -2.0039589405059814, "logits/rejected": -1.990099549293518, "logps/chosen": -6.081458568572998, "logps/rejected": -4.715532302856445, "loss": 0.3619, "rewards/accuracies": 1.0, "rewards/chosen": 1.5268484354019165, "rewards/margins": 0.8301296234130859, "rewards/rejected": 0.6967188119888306, "step": 2914 }, { "epoch": 1.57, "learning_rate": 6.924256232000997e-08, "logits/chosen": -2.1133127212524414, "logits/rejected": -2.114711046218872, "logps/chosen": -1.5955431461334229, "logps/rejected": -1.9378337860107422, "loss": 0.5275, "rewards/accuracies": 1.0, "rewards/chosen": 1.0530449151992798, "rewards/margins": 0.3643146753311157, "rewards/rejected": 0.6887302398681641, "step": 2915 }, { "epoch": 1.57, "learning_rate": 6.922240461971692e-08, "logits/chosen": -2.160818338394165, "logits/rejected": -2.352494239807129, "logps/chosen": -1.5681941509246826, "logps/rejected": -1.5502142906188965, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 1.1667267084121704, "rewards/margins": 0.033100128173828125, "rewards/rejected": 1.1336265802383423, "step": 2916 }, { "epoch": 1.57, "learning_rate": 6.920224325262294e-08, "logits/chosen": -1.9870500564575195, "logits/rejected": -1.9922518730163574, "logps/chosen": -3.4020514488220215, "logps/rejected": -2.387998580932617, "loss": 0.4644, "rewards/accuracies": 1.0, "rewards/chosen": 1.185705542564392, "rewards/margins": 0.5259641408920288, "rewards/rejected": 0.6597414016723633, "step": 2917 }, { "epoch": 1.57, "learning_rate": 6.918207822257393e-08, "logits/chosen": -2.12602162361145, "logits/rejected": -2.1006243228912354, "logps/chosen": -14.75124740600586, "logps/rejected": -1.2198199033737183, "loss": 0.3891, "rewards/accuracies": 1.0, "rewards/chosen": 1.7178621292114258, "rewards/margins": 0.7432051301002502, "rewards/rejected": 0.9746569991111755, "step": 2918 }, { "epoch": 1.57, "learning_rate": 6.916190953341655e-08, "logits/chosen": -2.0963120460510254, "logits/rejected": -2.293856143951416, "logps/chosen": -0.450176864862442, "logps/rejected": -0.5009154081344604, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.8824701309204102, "rewards/margins": 0.021374285221099854, "rewards/rejected": 0.8610958456993103, "step": 2919 }, { "epoch": 1.57, "learning_rate": 6.914173718899805e-08, "logits/chosen": -2.094696521759033, "logits/rejected": -2.1007814407348633, "logps/chosen": -3.6504671573638916, "logps/rejected": -3.6422345638275146, "loss": 0.5273, "rewards/accuracies": 1.0, "rewards/chosen": 1.1280848979949951, "rewards/margins": 0.3646857738494873, "rewards/rejected": 0.7633991241455078, "step": 2920 }, { "epoch": 1.58, "learning_rate": 6.912156119316648e-08, "logits/chosen": -2.110004425048828, "logits/rejected": -2.112632989883423, "logps/chosen": -4.024006366729736, "logps/rejected": -0.34981152415275574, "loss": 0.603, "rewards/accuracies": 1.0, "rewards/chosen": 1.2252899408340454, "rewards/margins": 0.18914639949798584, "rewards/rejected": 1.0361435413360596, "step": 2921 }, { "epoch": 1.58, "learning_rate": 6.910138154977054e-08, "logits/chosen": -2.035318613052368, "logits/rejected": -2.0326082706451416, "logps/chosen": -1.209370493888855, "logps/rejected": -4.807865619659424, "loss": 0.4516, "rewards/accuracies": 1.0, "rewards/chosen": 0.9181479811668396, "rewards/margins": 0.5607856512069702, "rewards/rejected": 0.3573623299598694, "step": 2922 }, { "epoch": 1.58, "learning_rate": 6.908119826265963e-08, "logits/chosen": -2.0415894985198975, "logits/rejected": -2.0299904346466064, "logps/chosen": -5.126962661743164, "logps/rejected": -2.278846502304077, "loss": 0.5791, "rewards/accuracies": 1.0, "rewards/chosen": 0.8421909213066101, "rewards/margins": 0.24283623695373535, "rewards/rejected": 0.5993546843528748, "step": 2923 }, { "epoch": 1.58, "learning_rate": 6.906101133568385e-08, "logits/chosen": -2.0661230087280273, "logits/rejected": -2.06703782081604, "logps/chosen": -0.4367629289627075, "logps/rejected": -3.7249369621276855, "loss": 0.4954, "rewards/accuracies": 1.0, "rewards/chosen": 1.060693621635437, "rewards/margins": 0.44453203678131104, "rewards/rejected": 0.616161584854126, "step": 2924 }, { "epoch": 1.58, "learning_rate": 6.904082077269396e-08, "logits/chosen": -2.0294647216796875, "logits/rejected": -2.272183418273926, "logps/chosen": -2.546380043029785, "logps/rejected": -2.2674176692962646, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.6941423416137695, "rewards/margins": 0.0033666491508483887, "rewards/rejected": 0.6907756924629211, "step": 2925 }, { "epoch": 1.58, "learning_rate": 6.902062657754145e-08, "logits/chosen": -1.9891835451126099, "logits/rejected": -1.9883135557174683, "logps/chosen": -2.8164870738983154, "logps/rejected": -3.1775360107421875, "loss": 0.4258, "rewards/accuracies": 1.0, "rewards/chosen": 1.1924275159835815, "rewards/margins": 0.6334211826324463, "rewards/rejected": 0.5590063333511353, "step": 2926 }, { "epoch": 1.58, "learning_rate": 6.900042875407853e-08, "logits/chosen": -2.0943198204040527, "logits/rejected": -2.300555944442749, "logps/chosen": -3.672419548034668, "logps/rejected": -1.8045426607131958, "loss": 0.7413, "rewards/accuracies": 0.0, "rewards/chosen": 0.9029543995857239, "rewards/margins": -0.09409809112548828, "rewards/rejected": 0.9970524907112122, "step": 2927 }, { "epoch": 1.58, "learning_rate": 6.898022730615805e-08, "logits/chosen": -2.102566719055176, "logits/rejected": -2.078644037246704, "logps/chosen": -13.300243377685547, "logps/rejected": -2.107086181640625, "loss": 0.4076, "rewards/accuracies": 1.0, "rewards/chosen": 1.3859227895736694, "rewards/margins": 0.6867005228996277, "rewards/rejected": 0.6992222666740417, "step": 2928 }, { "epoch": 1.58, "learning_rate": 6.896002223763356e-08, "logits/chosen": -2.1160478591918945, "logits/rejected": -2.3076815605163574, "logps/chosen": -3.7856574058532715, "logps/rejected": -3.8296597003936768, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.5233164429664612, "rewards/margins": 0.02266794443130493, "rewards/rejected": 0.5006484985351562, "step": 2929 }, { "epoch": 1.58, "learning_rate": 6.893981355235932e-08, "logits/chosen": -2.1213529109954834, "logits/rejected": -2.079659938812256, "logps/chosen": -20.071346282958984, "logps/rejected": -8.802438735961914, "loss": 0.3817, "rewards/accuracies": 1.0, "rewards/chosen": 1.4946292638778687, "rewards/margins": 0.7663609981536865, "rewards/rejected": 0.7282682657241821, "step": 2930 }, { "epoch": 1.58, "learning_rate": 6.891960125419028e-08, "logits/chosen": -2.1270413398742676, "logits/rejected": -2.3084557056427, "logps/chosen": -1.5622690916061401, "logps/rejected": -2.9645681381225586, "loss": 0.6108, "rewards/accuracies": 1.0, "rewards/chosen": 0.8992994427680969, "rewards/margins": 0.17210471630096436, "rewards/rejected": 0.7271947264671326, "step": 2931 }, { "epoch": 1.58, "learning_rate": 6.889938534698204e-08, "logits/chosen": -2.083934783935547, "logits/rejected": -2.336432695388794, "logps/chosen": -7.655421257019043, "logps/rejected": -6.011802673339844, "loss": 0.8377, "rewards/accuracies": 0.0, "rewards/chosen": 0.3607163429260254, "rewards/margins": -0.27089589834213257, "rewards/rejected": 0.631612241268158, "step": 2932 }, { "epoch": 1.58, "learning_rate": 6.887916583459094e-08, "logits/chosen": -2.073084592819214, "logits/rejected": -2.272707223892212, "logps/chosen": -0.3542276918888092, "logps/rejected": -0.36424607038497925, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.8664543032646179, "rewards/margins": 0.007402479648590088, "rewards/rejected": 0.8590518236160278, "step": 2933 }, { "epoch": 1.58, "learning_rate": 6.8858942720874e-08, "logits/chosen": -2.1333656311035156, "logits/rejected": -2.2917022705078125, "logps/chosen": -0.6007248759269714, "logps/rejected": -0.5518990159034729, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 0.9738451242446899, "rewards/margins": 0.03294575214385986, "rewards/rejected": 0.9408993721008301, "step": 2934 }, { "epoch": 1.58, "learning_rate": 6.883871600968889e-08, "logits/chosen": -2.195495367050171, "logits/rejected": -2.20310115814209, "logps/chosen": -1.80887770652771, "logps/rejected": -4.4962239265441895, "loss": 0.3381, "rewards/accuracies": 1.0, "rewards/chosen": 1.3384814262390137, "rewards/margins": 0.9106235504150391, "rewards/rejected": 0.4278578460216522, "step": 2935 }, { "epoch": 1.58, "learning_rate": 6.881848570489398e-08, "logits/chosen": -2.0241007804870605, "logits/rejected": -2.01635479927063, "logps/chosen": -4.633172035217285, "logps/rejected": -3.5410702228546143, "loss": 0.2619, "rewards/accuracies": 1.0, "rewards/chosen": 1.6010658740997314, "rewards/margins": 1.205877423286438, "rewards/rejected": 0.39518845081329346, "step": 2936 }, { "epoch": 1.58, "learning_rate": 6.879825181034836e-08, "logits/chosen": -2.2137207984924316, "logits/rejected": -2.209751605987549, "logps/chosen": -0.2393234670162201, "logps/rejected": -10.93803596496582, "loss": 0.3634, "rewards/accuracies": 1.0, "rewards/chosen": 0.9348048567771912, "rewards/margins": 0.825074315071106, "rewards/rejected": 0.10973053425550461, "step": 2937 }, { "epoch": 1.58, "learning_rate": 6.877801432991178e-08, "logits/chosen": -2.1246166229248047, "logits/rejected": -2.2903428077697754, "logps/chosen": -0.8764517903327942, "logps/rejected": -0.8993846774101257, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 1.07545006275177, "rewards/margins": 0.02226245403289795, "rewards/rejected": 1.053187608718872, "step": 2938 }, { "epoch": 1.59, "learning_rate": 6.875777326744465e-08, "logits/chosen": -2.0771210193634033, "logits/rejected": -2.2000157833099365, "logps/chosen": -0.4491332173347473, "logps/rejected": -0.4978480935096741, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.8969714045524597, "rewards/margins": 0.02470499277114868, "rewards/rejected": 0.872266411781311, "step": 2939 }, { "epoch": 1.59, "learning_rate": 6.873752862680811e-08, "logits/chosen": -2.0532734394073486, "logits/rejected": -2.25183367729187, "logps/chosen": -0.10591176152229309, "logps/rejected": -0.1041766107082367, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8661012649536133, "rewards/margins": 0.02271050214767456, "rewards/rejected": 0.8433907628059387, "step": 2940 }, { "epoch": 1.59, "learning_rate": 6.871728041186394e-08, "logits/chosen": -2.151681900024414, "logits/rejected": -2.2792744636535645, "logps/chosen": -4.629635334014893, "logps/rejected": -27.42698097229004, "loss": 0.3479, "rewards/accuracies": 1.0, "rewards/chosen": 1.2585190534591675, "rewards/margins": 0.8767687082290649, "rewards/rejected": 0.38175031542778015, "step": 2941 }, { "epoch": 1.59, "learning_rate": 6.869702862647461e-08, "logits/chosen": -2.135962724685669, "logits/rejected": -2.135603904724121, "logps/chosen": -1.2506608963012695, "logps/rejected": -3.245189905166626, "loss": 0.4648, "rewards/accuracies": 1.0, "rewards/chosen": 1.1077033281326294, "rewards/margins": 0.5248532295227051, "rewards/rejected": 0.5828500986099243, "step": 2942 }, { "epoch": 1.59, "learning_rate": 6.867677327450332e-08, "logits/chosen": -1.9586924314498901, "logits/rejected": -2.2389132976531982, "logps/chosen": -1.346604824066162, "logps/rejected": -1.3465118408203125, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8872987627983093, "rewards/margins": 0.019540011882781982, "rewards/rejected": 0.8677587509155273, "step": 2943 }, { "epoch": 1.59, "learning_rate": 6.865651435981389e-08, "logits/chosen": -2.103741407394409, "logits/rejected": -2.1068785190582275, "logps/chosen": -0.9757202863693237, "logps/rejected": -1.6163709163665771, "loss": 0.5457, "rewards/accuracies": 1.0, "rewards/chosen": 1.0728293657302856, "rewards/margins": 0.3205767869949341, "rewards/rejected": 0.7522525787353516, "step": 2944 }, { "epoch": 1.59, "learning_rate": 6.863625188627085e-08, "logits/chosen": -2.0394577980041504, "logits/rejected": -2.047954559326172, "logps/chosen": -2.374858856201172, "logps/rejected": -2.2497503757476807, "loss": 0.4642, "rewards/accuracies": 1.0, "rewards/chosen": 1.2299448251724243, "rewards/margins": 0.5263866782188416, "rewards/rejected": 0.7035581469535828, "step": 2945 }, { "epoch": 1.59, "learning_rate": 6.86159858577394e-08, "logits/chosen": -2.0139718055725098, "logits/rejected": -2.284090518951416, "logps/chosen": -11.081603050231934, "logps/rejected": -12.558622360229492, "loss": 0.7036, "rewards/accuracies": 0.0, "rewards/chosen": 1.0200940370559692, "rewards/margins": -0.020719289779663086, "rewards/rejected": 1.0408133268356323, "step": 2946 }, { "epoch": 1.59, "learning_rate": 6.859571627808542e-08, "logits/chosen": -2.1092686653137207, "logits/rejected": -2.304840326309204, "logps/chosen": -4.489782333374023, "logps/rejected": -2.7301363945007324, "loss": 0.709, "rewards/accuracies": 0.0, "rewards/chosen": 0.6427616477012634, "rewards/margins": -0.03153061866760254, "rewards/rejected": 0.674292266368866, "step": 2947 }, { "epoch": 1.59, "learning_rate": 6.857544315117543e-08, "logits/chosen": -2.03530216217041, "logits/rejected": -2.033674478530884, "logps/chosen": -0.8022577166557312, "logps/rejected": -1.2933547496795654, "loss": 0.6177, "rewards/accuracies": 1.0, "rewards/chosen": 0.9930265545845032, "rewards/margins": 0.15713626146316528, "rewards/rejected": 0.8358902931213379, "step": 2948 }, { "epoch": 1.59, "learning_rate": 6.855516648087671e-08, "logits/chosen": -2.043996810913086, "logits/rejected": -2.2904586791992188, "logps/chosen": -0.39767757058143616, "logps/rejected": -0.38218048214912415, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.7967391014099121, "rewards/margins": 0.01818525791168213, "rewards/rejected": 0.77855384349823, "step": 2949 }, { "epoch": 1.59, "learning_rate": 6.853488627105717e-08, "logits/chosen": -2.2052783966064453, "logits/rejected": -2.074582576751709, "logps/chosen": -55.318939208984375, "logps/rejected": -4.670388698577881, "loss": 0.1737, "rewards/accuracies": 1.0, "rewards/chosen": 2.3594062328338623, "rewards/margins": 1.662445068359375, "rewards/rejected": 0.6969611644744873, "step": 2950 }, { "epoch": 1.59, "learning_rate": 6.851460252558537e-08, "logits/chosen": -2.072537660598755, "logits/rejected": -2.334070920944214, "logps/chosen": -13.65573501586914, "logps/rejected": -15.455184936523438, "loss": 0.5947, "rewards/accuracies": 1.0, "rewards/chosen": 0.934407651424408, "rewards/margins": 0.20771050453186035, "rewards/rejected": 0.7266971468925476, "step": 2951 }, { "epoch": 1.59, "learning_rate": 6.849431524833059e-08, "logits/chosen": -1.9980560541152954, "logits/rejected": -1.9896595478057861, "logps/chosen": -5.422938346862793, "logps/rejected": -4.611624717712402, "loss": 0.3125, "rewards/accuracies": 1.0, "rewards/chosen": 1.538500189781189, "rewards/margins": 1.0028560161590576, "rewards/rejected": 0.5356441736221313, "step": 2952 }, { "epoch": 1.59, "learning_rate": 6.847402444316273e-08, "logits/chosen": -2.0874199867248535, "logits/rejected": -2.0845978260040283, "logps/chosen": -2.37245774269104, "logps/rejected": -5.535495758056641, "loss": 0.2863, "rewards/accuracies": 1.0, "rewards/chosen": 1.5389364957809448, "rewards/margins": 1.1042571067810059, "rewards/rejected": 0.4346793293952942, "step": 2953 }, { "epoch": 1.59, "learning_rate": 6.845373011395241e-08, "logits/chosen": -2.140294075012207, "logits/rejected": -2.1180436611175537, "logps/chosen": -9.490041732788086, "logps/rejected": -4.550545692443848, "loss": 0.3614, "rewards/accuracies": 1.0, "rewards/chosen": 1.328987956047058, "rewards/margins": 0.8315654993057251, "rewards/rejected": 0.4974224269390106, "step": 2954 }, { "epoch": 1.59, "learning_rate": 6.843343226457093e-08, "logits/chosen": -2.1162948608398438, "logits/rejected": -2.2158548831939697, "logps/chosen": -0.6916826367378235, "logps/rejected": -0.8340545296669006, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 0.6867492198944092, "rewards/margins": 0.045199453830718994, "rewards/rejected": 0.6415497660636902, "step": 2955 }, { "epoch": 1.59, "learning_rate": 6.841313089889022e-08, "logits/chosen": -2.121558666229248, "logits/rejected": -2.317225933074951, "logps/chosen": -0.3368481993675232, "logps/rejected": -0.33544379472732544, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.855427086353302, "rewards/margins": 0.020858168601989746, "rewards/rejected": 0.8345689177513123, "step": 2956 }, { "epoch": 1.59, "learning_rate": 6.83928260207829e-08, "logits/chosen": -2.1418302059173584, "logits/rejected": -2.2182040214538574, "logps/chosen": -7.01489782333374, "logps/rejected": -5.835204601287842, "loss": 0.7031, "rewards/accuracies": 0.0, "rewards/chosen": 0.7389704585075378, "rewards/margins": -0.019901812076568604, "rewards/rejected": 0.7588722705841064, "step": 2957 }, { "epoch": 1.6, "learning_rate": 6.837251763412226e-08, "logits/chosen": -2.1133389472961426, "logits/rejected": -2.0419201850891113, "logps/chosen": -20.773250579833984, "logps/rejected": -2.347783327102661, "loss": 0.3742, "rewards/accuracies": 1.0, "rewards/chosen": 1.4413856267929077, "rewards/margins": 0.7900437116622925, "rewards/rejected": 0.6513419151306152, "step": 2958 }, { "epoch": 1.6, "learning_rate": 6.835220574278225e-08, "logits/chosen": -2.141563892364502, "logits/rejected": -2.1413168907165527, "logps/chosen": -4.402714252471924, "logps/rejected": -2.2216897010803223, "loss": 0.2866, "rewards/accuracies": 1.0, "rewards/chosen": 1.6645539999008179, "rewards/margins": 1.1029683351516724, "rewards/rejected": 0.5615856647491455, "step": 2959 }, { "epoch": 1.6, "learning_rate": 6.833189035063754e-08, "logits/chosen": -1.937953233718872, "logits/rejected": -1.9432438611984253, "logps/chosen": -1.4233731031417847, "logps/rejected": -5.314246654510498, "loss": 0.425, "rewards/accuracies": 1.0, "rewards/chosen": 0.9800364375114441, "rewards/margins": 0.6356372833251953, "rewards/rejected": 0.3443991243839264, "step": 2960 }, { "epoch": 1.6, "learning_rate": 6.831157146156338e-08, "logits/chosen": -2.068176746368408, "logits/rejected": -2.2493479251861572, "logps/chosen": -0.4396013021469116, "logps/rejected": -0.5217398405075073, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 1.0318979024887085, "rewards/margins": -0.0038645267486572266, "rewards/rejected": 1.0357624292373657, "step": 2961 }, { "epoch": 1.6, "learning_rate": 6.829124907943573e-08, "logits/chosen": -2.0899312496185303, "logits/rejected": -2.093362331390381, "logps/chosen": -2.5553016662597656, "logps/rejected": -5.475879669189453, "loss": 0.3196, "rewards/accuracies": 1.0, "rewards/chosen": 1.5165061950683594, "rewards/margins": 0.9768014550209045, "rewards/rejected": 0.5397047400474548, "step": 2962 }, { "epoch": 1.6, "learning_rate": 6.827092320813126e-08, "logits/chosen": -2.035651206970215, "logits/rejected": -2.2781789302825928, "logps/chosen": -0.9878937602043152, "logps/rejected": -0.9336382746696472, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.9103280305862427, "rewards/margins": 0.03125375509262085, "rewards/rejected": 0.8790742754936218, "step": 2963 }, { "epoch": 1.6, "learning_rate": 6.825059385152721e-08, "logits/chosen": -2.054534435272217, "logits/rejected": -2.055152177810669, "logps/chosen": -1.7037028074264526, "logps/rejected": -0.764205276966095, "loss": 0.3738, "rewards/accuracies": 1.0, "rewards/chosen": 1.5284265279769897, "rewards/margins": 0.7912829518318176, "rewards/rejected": 0.7371435761451721, "step": 2964 }, { "epoch": 1.6, "learning_rate": 6.823026101350159e-08, "logits/chosen": -1.9279462099075317, "logits/rejected": -1.9320471286773682, "logps/chosen": -1.5114727020263672, "logps/rejected": -2.643747568130493, "loss": 0.5282, "rewards/accuracies": 1.0, "rewards/chosen": 1.0480960607528687, "rewards/margins": 0.3625869154930115, "rewards/rejected": 0.6855091452598572, "step": 2965 }, { "epoch": 1.6, "learning_rate": 6.820992469793301e-08, "logits/chosen": -2.0323474407196045, "logits/rejected": -2.031229257583618, "logps/chosen": -0.3127687871456146, "logps/rejected": -2.5446183681488037, "loss": 0.5434, "rewards/accuracies": 1.0, "rewards/chosen": 0.8785296678543091, "rewards/margins": 0.3260002136230469, "rewards/rejected": 0.5525294542312622, "step": 2966 }, { "epoch": 1.6, "learning_rate": 6.818958490870073e-08, "logits/chosen": -2.130023241043091, "logits/rejected": -2.048532009124756, "logps/chosen": -7.210224151611328, "logps/rejected": -1.7618826627731323, "loss": 0.4058, "rewards/accuracies": 1.0, "rewards/chosen": 1.5833555459976196, "rewards/margins": 0.6920549273490906, "rewards/rejected": 0.891300618648529, "step": 2967 }, { "epoch": 1.6, "learning_rate": 6.816924164968473e-08, "logits/chosen": -2.1147797107696533, "logits/rejected": -2.146261215209961, "logps/chosen": -2.7862987518310547, "logps/rejected": -23.18529510498047, "loss": 0.6063, "rewards/accuracies": 1.0, "rewards/chosen": 1.082733154296875, "rewards/margins": 0.18201333284378052, "rewards/rejected": 0.9007198214530945, "step": 2968 }, { "epoch": 1.6, "learning_rate": 6.814889492476562e-08, "logits/chosen": -2.0449323654174805, "logits/rejected": -2.2651615142822266, "logps/chosen": -0.3652591109275818, "logps/rejected": -0.36207708716392517, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": 0.8582157492637634, "rewards/margins": -0.009273409843444824, "rewards/rejected": 0.8674891591072083, "step": 2969 }, { "epoch": 1.6, "learning_rate": 6.812854473782466e-08, "logits/chosen": -1.9646632671356201, "logits/rejected": -2.2495627403259277, "logps/chosen": -2.0681450366973877, "logps/rejected": -2.068899393081665, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.7253789305686951, "rewards/margins": 0.029238998889923096, "rewards/rejected": 0.696139931678772, "step": 2970 }, { "epoch": 1.6, "learning_rate": 6.810819109274378e-08, "logits/chosen": -2.02522349357605, "logits/rejected": -2.2735559940338135, "logps/chosen": -8.162041664123535, "logps/rejected": -6.7826762199401855, "loss": 0.744, "rewards/accuracies": 0.0, "rewards/chosen": 0.5121205449104309, "rewards/margins": -0.09920132160186768, "rewards/rejected": 0.6113218665122986, "step": 2971 }, { "epoch": 1.6, "learning_rate": 6.808783399340557e-08, "logits/chosen": -1.9884306192398071, "logits/rejected": -1.96413254737854, "logps/chosen": -13.088045120239258, "logps/rejected": -6.474309921264648, "loss": 0.3953, "rewards/accuracies": 1.0, "rewards/chosen": 1.4955949783325195, "rewards/margins": 0.7239362597465515, "rewards/rejected": 0.771658718585968, "step": 2972 }, { "epoch": 1.6, "learning_rate": 6.80674734436933e-08, "logits/chosen": -2.011117458343506, "logits/rejected": -2.260199785232544, "logps/chosen": -0.31950151920318604, "logps/rejected": -0.335260808467865, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.9860585331916809, "rewards/margins": -0.00030243396759033203, "rewards/rejected": 0.9863609671592712, "step": 2973 }, { "epoch": 1.6, "learning_rate": 6.804710944749087e-08, "logits/chosen": -2.105848789215088, "logits/rejected": -2.0826809406280518, "logps/chosen": -16.409029006958008, "logps/rejected": -3.5090529918670654, "loss": 0.2163, "rewards/accuracies": 1.0, "rewards/chosen": 2.168853521347046, "rewards/margins": 1.4207963943481445, "rewards/rejected": 0.7480570673942566, "step": 2974 }, { "epoch": 1.6, "learning_rate": 6.802674200868285e-08, "logits/chosen": -2.2125918865203857, "logits/rejected": -2.0991716384887695, "logps/chosen": -18.461280822753906, "logps/rejected": -13.849894523620605, "loss": 0.513, "rewards/accuracies": 1.0, "rewards/chosen": 1.4472969770431519, "rewards/margins": 0.4001023769378662, "rewards/rejected": 1.0471946001052856, "step": 2975 }, { "epoch": 1.61, "learning_rate": 6.800637113115447e-08, "logits/chosen": -2.0449464321136475, "logits/rejected": -2.049640417098999, "logps/chosen": -2.8323733806610107, "logps/rejected": -5.027156829833984, "loss": 0.4395, "rewards/accuracies": 1.0, "rewards/chosen": 1.0704296827316284, "rewards/margins": 0.5942073464393616, "rewards/rejected": 0.47622233629226685, "step": 2976 }, { "epoch": 1.61, "learning_rate": 6.798599681879157e-08, "logits/chosen": -2.094120740890503, "logits/rejected": -2.1011765003204346, "logps/chosen": -3.2604305744171143, "logps/rejected": -6.8872575759887695, "loss": 0.4117, "rewards/accuracies": 1.0, "rewards/chosen": 1.5382755994796753, "rewards/margins": 0.6744464635848999, "rewards/rejected": 0.8638291358947754, "step": 2977 }, { "epoch": 1.61, "learning_rate": 6.796561907548073e-08, "logits/chosen": -2.1459896564483643, "logits/rejected": -2.3552980422973633, "logps/chosen": -11.22579574584961, "logps/rejected": -11.919950485229492, "loss": 0.636, "rewards/accuracies": 1.0, "rewards/chosen": 1.097390055656433, "rewards/margins": 0.11774927377700806, "rewards/rejected": 0.979640781879425, "step": 2978 }, { "epoch": 1.61, "learning_rate": 6.794523790510913e-08, "logits/chosen": -2.1049082279205322, "logits/rejected": -2.116161346435547, "logps/chosen": -4.373945236206055, "logps/rejected": -7.385267734527588, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": 1.4925050735473633, "rewards/margins": 1.0019311904907227, "rewards/rejected": 0.49057385325431824, "step": 2979 }, { "epoch": 1.61, "learning_rate": 6.79248533115646e-08, "logits/chosen": -2.1322402954101562, "logits/rejected": -2.1234824657440186, "logps/chosen": -14.442270278930664, "logps/rejected": -11.768714904785156, "loss": 0.4339, "rewards/accuracies": 1.0, "rewards/chosen": 1.308754801750183, "rewards/margins": 0.6101027131080627, "rewards/rejected": 0.6986520886421204, "step": 2980 }, { "epoch": 1.61, "learning_rate": 6.790446529873563e-08, "logits/chosen": -2.207026481628418, "logits/rejected": -2.2022078037261963, "logps/chosen": -3.0110979080200195, "logps/rejected": -5.288907527923584, "loss": 0.4659, "rewards/accuracies": 1.0, "rewards/chosen": 1.0058183670043945, "rewards/margins": 0.5217005014419556, "rewards/rejected": 0.4841178357601166, "step": 2981 }, { "epoch": 1.61, "learning_rate": 6.788407387051142e-08, "logits/chosen": -2.0088131427764893, "logits/rejected": -2.0182459354400635, "logps/chosen": -1.6277239322662354, "logps/rejected": -2.90617299079895, "loss": 0.5474, "rewards/accuracies": 1.0, "rewards/chosen": 1.0104650259017944, "rewards/margins": 0.3163914084434509, "rewards/rejected": 0.6940736174583435, "step": 2982 }, { "epoch": 1.61, "learning_rate": 6.786367903078171e-08, "logits/chosen": -2.1586966514587402, "logits/rejected": -2.2688493728637695, "logps/chosen": -0.8852238655090332, "logps/rejected": -0.9449244141578674, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 1.0499041080474854, "rewards/margins": 0.006457686424255371, "rewards/rejected": 1.04344642162323, "step": 2983 }, { "epoch": 1.61, "learning_rate": 6.784328078343698e-08, "logits/chosen": -2.2103679180145264, "logits/rejected": -2.121723175048828, "logps/chosen": -21.967735290527344, "logps/rejected": -1.9005062580108643, "loss": 0.203, "rewards/accuracies": 1.0, "rewards/chosen": 2.1865108013153076, "rewards/margins": 1.4914259910583496, "rewards/rejected": 0.6950848698616028, "step": 2984 }, { "epoch": 1.61, "learning_rate": 6.782287913236832e-08, "logits/chosen": -2.1546642780303955, "logits/rejected": -2.125359296798706, "logps/chosen": -26.647695541381836, "logps/rejected": -5.089963912963867, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 1.3487695455551147, "rewards/margins": 0.27587640285491943, "rewards/rejected": 1.0728931427001953, "step": 2985 }, { "epoch": 1.61, "learning_rate": 6.780247408146748e-08, "logits/chosen": -1.996747374534607, "logits/rejected": -2.2678933143615723, "logps/chosen": -1.5891504287719727, "logps/rejected": -1.4813792705535889, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.6772453188896179, "rewards/margins": 0.011798441410064697, "rewards/rejected": 0.6654468774795532, "step": 2986 }, { "epoch": 1.61, "learning_rate": 6.778206563462686e-08, "logits/chosen": -2.143159866333008, "logits/rejected": -2.4274420738220215, "logps/chosen": -10.290882110595703, "logps/rejected": -13.813944816589355, "loss": 0.7229, "rewards/accuracies": 0.0, "rewards/chosen": 0.9282554984092712, "rewards/margins": -0.05857664346694946, "rewards/rejected": 0.9868321418762207, "step": 2987 }, { "epoch": 1.61, "learning_rate": 6.776165379573951e-08, "logits/chosen": -2.062159776687622, "logits/rejected": -2.0648505687713623, "logps/chosen": -2.422715425491333, "logps/rejected": -1.3438528776168823, "loss": 0.4879, "rewards/accuracies": 1.0, "rewards/chosen": 1.3517711162567139, "rewards/margins": 0.4636737108230591, "rewards/rejected": 0.8880974054336548, "step": 2988 }, { "epoch": 1.61, "learning_rate": 6.774123856869914e-08, "logits/chosen": -2.131298542022705, "logits/rejected": -2.293562650680542, "logps/chosen": -0.6626931428909302, "logps/rejected": -0.7208653092384338, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 1.0244324207305908, "rewards/margins": 0.028024017810821533, "rewards/rejected": 0.9964084029197693, "step": 2989 }, { "epoch": 1.61, "learning_rate": 6.772081995740005e-08, "logits/chosen": -2.184828758239746, "logits/rejected": -2.337022542953491, "logps/chosen": -1.490134596824646, "logps/rejected": -1.4930922985076904, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.7811937928199768, "rewards/margins": -0.0031205415725708008, "rewards/rejected": 0.7843143343925476, "step": 2990 }, { "epoch": 1.61, "learning_rate": 6.770039796573726e-08, "logits/chosen": -1.9821593761444092, "logits/rejected": -2.2512168884277344, "logps/chosen": -0.5008065104484558, "logps/rejected": -0.5307439565658569, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.880157470703125, "rewards/margins": 0.0049002766609191895, "rewards/rejected": 0.8752571940422058, "step": 2991 }, { "epoch": 1.61, "learning_rate": 6.767997259760637e-08, "logits/chosen": -2.070038080215454, "logits/rejected": -2.0613086223602295, "logps/chosen": -9.882706642150879, "logps/rejected": -1.9346424341201782, "loss": 0.6747, "rewards/accuracies": 1.0, "rewards/chosen": 1.114616870880127, "rewards/margins": 0.03717672824859619, "rewards/rejected": 1.0774401426315308, "step": 2992 }, { "epoch": 1.61, "learning_rate": 6.76595438569037e-08, "logits/chosen": -2.0561161041259766, "logits/rejected": -2.2582974433898926, "logps/chosen": -1.0786234140396118, "logps/rejected": -0.9503090381622314, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8886181712150574, "rewards/margins": 0.011000216007232666, "rewards/rejected": 0.8776179552078247, "step": 2993 }, { "epoch": 1.61, "learning_rate": 6.76391117475261e-08, "logits/chosen": -2.0686347484588623, "logits/rejected": -2.237273693084717, "logps/chosen": -0.5286461114883423, "logps/rejected": -0.5334092974662781, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0239742994308472, "rewards/margins": 0.013570308685302734, "rewards/rejected": 1.0104039907455444, "step": 2994 }, { "epoch": 1.62, "learning_rate": 6.76186762733712e-08, "logits/chosen": -2.10271954536438, "logits/rejected": -2.1355173587799072, "logps/chosen": -3.990431785583496, "logps/rejected": -13.955865859985352, "loss": 0.4463, "rewards/accuracies": 1.0, "rewards/chosen": 1.2465518712997437, "rewards/margins": 0.5754235982894897, "rewards/rejected": 0.6711282730102539, "step": 2995 }, { "epoch": 1.62, "learning_rate": 6.759823743833716e-08, "logits/chosen": -2.068272829055786, "logits/rejected": -2.234246015548706, "logps/chosen": -0.7668761610984802, "logps/rejected": -0.8622477054595947, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.7392317056655884, "rewards/margins": 0.030274033546447754, "rewards/rejected": 0.7089576721191406, "step": 2996 }, { "epoch": 1.62, "learning_rate": 6.757779524632283e-08, "logits/chosen": -2.0373828411102295, "logits/rejected": -2.0439298152923584, "logps/chosen": -5.027646541595459, "logps/rejected": -3.3042752742767334, "loss": 0.3949, "rewards/accuracies": 1.0, "rewards/chosen": 1.3921838998794556, "rewards/margins": 0.7252870798110962, "rewards/rejected": 0.6668968200683594, "step": 2997 }, { "epoch": 1.62, "learning_rate": 6.75573497012277e-08, "logits/chosen": -2.0219759941101074, "logits/rejected": -2.0290677547454834, "logps/chosen": -1.8040709495544434, "logps/rejected": -4.598864555358887, "loss": 0.4324, "rewards/accuracies": 1.0, "rewards/chosen": 0.9894819259643555, "rewards/margins": 0.6143486499786377, "rewards/rejected": 0.3751332461833954, "step": 2998 }, { "epoch": 1.62, "learning_rate": 6.75369008069519e-08, "logits/chosen": -2.07007098197937, "logits/rejected": -2.0695502758026123, "logps/chosen": -6.931251525878906, "logps/rejected": -5.798139572143555, "loss": 0.6563, "rewards/accuracies": 1.0, "rewards/chosen": 1.1366230249404907, "rewards/margins": 0.07501506805419922, "rewards/rejected": 1.0616079568862915, "step": 2999 }, { "epoch": 1.62, "learning_rate": 6.751644856739618e-08, "logits/chosen": -2.1242308616638184, "logits/rejected": -2.103585720062256, "logps/chosen": -12.625927925109863, "logps/rejected": -5.163229465484619, "loss": 0.5363, "rewards/accuracies": 1.0, "rewards/chosen": 1.0539668798446655, "rewards/margins": 0.3428388833999634, "rewards/rejected": 0.7111279964447021, "step": 3000 }, { "epoch": 1.62, "learning_rate": 6.749599298646194e-08, "logits/chosen": -2.0541906356811523, "logits/rejected": -2.0426676273345947, "logps/chosen": -6.414521217346191, "logps/rejected": -2.663579225540161, "loss": 0.4239, "rewards/accuracies": 1.0, "rewards/chosen": 1.4663035869598389, "rewards/margins": 0.6388565301895142, "rewards/rejected": 0.8274470567703247, "step": 3001 }, { "epoch": 1.62, "learning_rate": 6.747553406805121e-08, "logits/chosen": -1.9746586084365845, "logits/rejected": -2.2651760578155518, "logps/chosen": -3.3953170776367188, "logps/rejected": -3.3113956451416016, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 1.0773407220840454, "rewards/margins": 0.020690202713012695, "rewards/rejected": 1.0566505193710327, "step": 3002 }, { "epoch": 1.62, "learning_rate": 6.745507181606668e-08, "logits/chosen": -2.040325164794922, "logits/rejected": -2.026095390319824, "logps/chosen": -6.377098560333252, "logps/rejected": -0.6662989258766174, "loss": 0.388, "rewards/accuracies": 1.0, "rewards/chosen": 1.583152413368225, "rewards/margins": 0.7463994026184082, "rewards/rejected": 0.8367530107498169, "step": 3003 }, { "epoch": 1.62, "learning_rate": 6.743460623441164e-08, "logits/chosen": -2.1813130378723145, "logits/rejected": -2.3059983253479004, "logps/chosen": -0.4929823577404022, "logps/rejected": -0.5639887452125549, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427264332771301, "rewards/margins": 0.01333397626876831, "rewards/rejected": 0.9293924570083618, "step": 3004 }, { "epoch": 1.62, "learning_rate": 6.741413732699007e-08, "logits/chosen": -2.005289077758789, "logits/rejected": -2.016390085220337, "logps/chosen": -2.063760757446289, "logps/rejected": -2.1080639362335205, "loss": 0.4246, "rewards/accuracies": 1.0, "rewards/chosen": 1.2792704105377197, "rewards/margins": 0.636867344379425, "rewards/rejected": 0.6424030661582947, "step": 3005 }, { "epoch": 1.62, "learning_rate": 6.739366509770652e-08, "logits/chosen": -2.058912992477417, "logits/rejected": -2.07226300239563, "logps/chosen": -5.468908786773682, "logps/rejected": -9.353750228881836, "loss": 0.2807, "rewards/accuracies": 1.0, "rewards/chosen": 1.6997969150543213, "rewards/margins": 1.12686026096344, "rewards/rejected": 0.5729366540908813, "step": 3006 }, { "epoch": 1.62, "learning_rate": 6.737318955046621e-08, "logits/chosen": -2.178480386734009, "logits/rejected": -2.2146589756011963, "logps/chosen": -11.598282814025879, "logps/rejected": -12.138051986694336, "loss": 0.4343, "rewards/accuracies": 1.0, "rewards/chosen": 1.346882700920105, "rewards/margins": 0.6090654730796814, "rewards/rejected": 0.7378172278404236, "step": 3007 }, { "epoch": 1.62, "learning_rate": 6.7352710689175e-08, "logits/chosen": -2.0426695346832275, "logits/rejected": -2.291365623474121, "logps/chosen": -0.5381782650947571, "logps/rejected": -0.6430854201316833, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9764003157615662, "rewards/margins": 0.0055422186851501465, "rewards/rejected": 0.970858097076416, "step": 3008 }, { "epoch": 1.62, "learning_rate": 6.733222851773935e-08, "logits/chosen": -2.0099105834960938, "logits/rejected": -2.0087430477142334, "logps/chosen": -2.808722972869873, "logps/rejected": -5.074699878692627, "loss": 0.2704, "rewards/accuracies": 1.0, "rewards/chosen": 1.576380729675293, "rewards/margins": 1.1696032285690308, "rewards/rejected": 0.4067775309085846, "step": 3009 }, { "epoch": 1.62, "learning_rate": 6.731174304006639e-08, "logits/chosen": -2.002943992614746, "logits/rejected": -2.2907519340515137, "logps/chosen": -0.92265784740448, "logps/rejected": -0.9035675525665283, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 1.0035934448242188, "rewards/margins": -0.014506340026855469, "rewards/rejected": 1.0180997848510742, "step": 3010 }, { "epoch": 1.62, "learning_rate": 6.729125426006382e-08, "logits/chosen": -2.1036152839660645, "logits/rejected": -2.112955331802368, "logps/chosen": -0.5768488645553589, "logps/rejected": -6.0601372718811035, "loss": 0.5875, "rewards/accuracies": 1.0, "rewards/chosen": 1.0966447591781616, "rewards/margins": 0.22371309995651245, "rewards/rejected": 0.8729316592216492, "step": 3011 }, { "epoch": 1.62, "learning_rate": 6.727076218164005e-08, "logits/chosen": -2.080115795135498, "logits/rejected": -2.078284740447998, "logps/chosen": -0.7105968594551086, "logps/rejected": -4.239039897918701, "loss": 0.465, "rewards/accuracies": 1.0, "rewards/chosen": 0.9142001271247864, "rewards/margins": 0.524348795413971, "rewards/rejected": 0.38985133171081543, "step": 3012 }, { "epoch": 1.63, "learning_rate": 6.725026680870407e-08, "logits/chosen": -2.146214008331299, "logits/rejected": -2.146911144256592, "logps/chosen": -2.1640188694000244, "logps/rejected": -4.14974308013916, "loss": 0.2784, "rewards/accuracies": 1.0, "rewards/chosen": 1.6846178770065308, "rewards/margins": 1.136307954788208, "rewards/rejected": 0.5483099818229675, "step": 3013 }, { "epoch": 1.63, "learning_rate": 6.72297681451655e-08, "logits/chosen": -2.1744353771209717, "logits/rejected": -2.0393049716949463, "logps/chosen": -30.266653060913086, "logps/rejected": -17.488393783569336, "loss": 0.2976, "rewards/accuracies": 1.0, "rewards/chosen": 1.897552728652954, "rewards/margins": 1.059364914894104, "rewards/rejected": 0.8381878137588501, "step": 3014 }, { "epoch": 1.63, "learning_rate": 6.720926619493462e-08, "logits/chosen": -2.2989354133605957, "logits/rejected": -2.0146381855010986, "logps/chosen": -67.55916595458984, "logps/rejected": -12.636716842651367, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": 3.219153642654419, "rewards/margins": 2.468665838241577, "rewards/rejected": 0.750487744808197, "step": 3015 }, { "epoch": 1.63, "learning_rate": 6.718876096192227e-08, "logits/chosen": -1.9617717266082764, "logits/rejected": -1.9613085985183716, "logps/chosen": -1.1268099546432495, "logps/rejected": -0.9909533858299255, "loss": 0.6075, "rewards/accuracies": 1.0, "rewards/chosen": 0.9394869208335876, "rewards/margins": 0.17938834428787231, "rewards/rejected": 0.7600985765457153, "step": 3016 }, { "epoch": 1.63, "learning_rate": 6.716825245003998e-08, "logits/chosen": -2.1556270122528076, "logits/rejected": -2.2624194622039795, "logps/chosen": -0.7472130656242371, "logps/rejected": -2.1872615814208984, "loss": 0.6519, "rewards/accuracies": 1.0, "rewards/chosen": 0.9053384065628052, "rewards/margins": 0.08431118726730347, "rewards/rejected": 0.8210272192955017, "step": 3017 }, { "epoch": 1.63, "learning_rate": 6.71477406631999e-08, "logits/chosen": -1.928627848625183, "logits/rejected": -2.2500696182250977, "logps/chosen": -0.23252810537815094, "logps/rejected": -0.24348881840705872, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 0.8800846338272095, "rewards/margins": 0.04993170499801636, "rewards/rejected": 0.8301529288291931, "step": 3018 }, { "epoch": 1.63, "learning_rate": 6.712722560531477e-08, "logits/chosen": -2.1852355003356934, "logits/rejected": -2.2999048233032227, "logps/chosen": -0.30924248695373535, "logps/rejected": -0.32280924916267395, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.8778007626533508, "rewards/margins": 0.0023339390754699707, "rewards/rejected": 0.8754668235778809, "step": 3019 }, { "epoch": 1.63, "learning_rate": 6.710670728029797e-08, "logits/chosen": -2.1452512741088867, "logits/rejected": -2.3256895542144775, "logps/chosen": -1.2942253351211548, "logps/rejected": -1.160282015800476, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.8779167532920837, "rewards/margins": 0.001405954360961914, "rewards/rejected": 0.8765107989311218, "step": 3020 }, { "epoch": 1.63, "learning_rate": 6.708618569206352e-08, "logits/chosen": -2.1072909832000732, "logits/rejected": -2.1354238986968994, "logps/chosen": -5.9463701248168945, "logps/rejected": -9.029632568359375, "loss": 0.4054, "rewards/accuracies": 1.0, "rewards/chosen": 1.603017807006836, "rewards/margins": 0.6933973431587219, "rewards/rejected": 0.909620463848114, "step": 3021 }, { "epoch": 1.63, "learning_rate": 6.706566084452604e-08, "logits/chosen": -2.2335402965545654, "logits/rejected": -2.3775665760040283, "logps/chosen": -8.464649200439453, "logps/rejected": -8.929465293884277, "loss": 0.6534, "rewards/accuracies": 1.0, "rewards/chosen": 1.2898972034454346, "rewards/margins": 0.0812140703201294, "rewards/rejected": 1.2086831331253052, "step": 3022 }, { "epoch": 1.63, "learning_rate": 6.704513274160082e-08, "logits/chosen": -2.150972366333008, "logits/rejected": -2.1507880687713623, "logps/chosen": -6.036031723022461, "logps/rejected": -3.751127004623413, "loss": 0.3615, "rewards/accuracies": 1.0, "rewards/chosen": 1.3762083053588867, "rewards/margins": 0.831311047077179, "rewards/rejected": 0.5448972582817078, "step": 3023 }, { "epoch": 1.63, "learning_rate": 6.702460138720367e-08, "logits/chosen": -2.019134044647217, "logits/rejected": -2.309404134750366, "logps/chosen": -1.1295057535171509, "logps/rejected": -0.8403928875923157, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 0.858810544013977, "rewards/margins": 0.03012716770172119, "rewards/rejected": 0.8286833763122559, "step": 3024 }, { "epoch": 1.63, "learning_rate": 6.700406678525112e-08, "logits/chosen": -2.1245548725128174, "logits/rejected": -2.109589099884033, "logps/chosen": -33.676353454589844, "logps/rejected": -27.016090393066406, "loss": 0.2762, "rewards/accuracies": 1.0, "rewards/chosen": 1.9064865112304688, "rewards/margins": 1.145464301109314, "rewards/rejected": 0.7610222101211548, "step": 3025 }, { "epoch": 1.63, "learning_rate": 6.698352893966024e-08, "logits/chosen": -1.9863184690475464, "logits/rejected": -2.3256335258483887, "logps/chosen": -0.6442306637763977, "logps/rejected": -0.6150355339050293, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 1.0424860715866089, "rewards/margins": 0.0222322940826416, "rewards/rejected": 1.0202537775039673, "step": 3026 }, { "epoch": 1.63, "learning_rate": 6.69629878543488e-08, "logits/chosen": -2.1537036895751953, "logits/rejected": -2.303511619567871, "logps/chosen": -2.063833236694336, "logps/rejected": -0.6736632585525513, "loss": 0.7309, "rewards/accuracies": 0.0, "rewards/chosen": 0.8926555514335632, "rewards/margins": -0.07406151294708252, "rewards/rejected": 0.9667170643806458, "step": 3027 }, { "epoch": 1.63, "learning_rate": 6.694244353323515e-08, "logits/chosen": -2.10912823677063, "logits/rejected": -2.2737808227539062, "logps/chosen": -0.24987486004829407, "logps/rejected": -0.23744478821754456, "loss": 0.6713, "rewards/accuracies": 1.0, "rewards/chosen": 0.9431408047676086, "rewards/margins": 0.04417228698730469, "rewards/rejected": 0.898968517780304, "step": 3028 }, { "epoch": 1.63, "learning_rate": 6.692189598023823e-08, "logits/chosen": -2.071735382080078, "logits/rejected": -2.0722298622131348, "logps/chosen": -1.0031479597091675, "logps/rejected": -3.3291101455688477, "loss": 0.5251, "rewards/accuracies": 1.0, "rewards/chosen": 1.1675324440002441, "rewards/margins": 0.37009209394454956, "rewards/rejected": 0.7974403500556946, "step": 3029 }, { "epoch": 1.63, "learning_rate": 6.690134519927765e-08, "logits/chosen": -2.108017921447754, "logits/rejected": -2.1053147315979004, "logps/chosen": -1.2250691652297974, "logps/rejected": -2.9660160541534424, "loss": 0.4967, "rewards/accuracies": 1.0, "rewards/chosen": 1.1456712484359741, "rewards/margins": 0.44119346141815186, "rewards/rejected": 0.7044777870178223, "step": 3030 }, { "epoch": 1.63, "learning_rate": 6.68807911942736e-08, "logits/chosen": -2.138180732727051, "logits/rejected": -2.138610601425171, "logps/chosen": -0.31382736563682556, "logps/rejected": -5.607003211975098, "loss": 0.4488, "rewards/accuracies": 1.0, "rewards/chosen": 0.9577189683914185, "rewards/margins": 0.5684726238250732, "rewards/rejected": 0.3892463743686676, "step": 3031 }, { "epoch": 1.64, "learning_rate": 6.686023396914684e-08, "logits/chosen": -2.176718235015869, "logits/rejected": -2.183159589767456, "logps/chosen": -2.407463788986206, "logps/rejected": -5.745490074157715, "loss": 0.3056, "rewards/accuracies": 1.0, "rewards/chosen": 1.6160900592803955, "rewards/margins": 1.0287714004516602, "rewards/rejected": 0.5873185992240906, "step": 3032 }, { "epoch": 1.64, "learning_rate": 6.683967352781888e-08, "logits/chosen": -2.107438087463379, "logits/rejected": -2.286360025405884, "logps/chosen": -1.6399184465408325, "logps/rejected": -1.6077018976211548, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 1.0054476261138916, "rewards/margins": -0.00033986568450927734, "rewards/rejected": 1.0057874917984009, "step": 3033 }, { "epoch": 1.64, "learning_rate": 6.68191098742117e-08, "logits/chosen": -2.0518689155578613, "logits/rejected": -2.2663028240203857, "logps/chosen": -0.510098397731781, "logps/rejected": -1.535423994064331, "loss": 0.624, "rewards/accuracies": 1.0, "rewards/chosen": 1.01369309425354, "rewards/margins": 0.14345306158065796, "rewards/rejected": 0.8702400326728821, "step": 3034 }, { "epoch": 1.64, "learning_rate": 6.679854301224799e-08, "logits/chosen": -2.1057560443878174, "logits/rejected": -2.112008571624756, "logps/chosen": -1.4808125495910645, "logps/rejected": -2.900750160217285, "loss": 0.4422, "rewards/accuracies": 1.0, "rewards/chosen": 1.1297035217285156, "rewards/margins": 0.5866259336471558, "rewards/rejected": 0.5430775880813599, "step": 3035 }, { "epoch": 1.64, "learning_rate": 6.6777972945851e-08, "logits/chosen": -2.127211093902588, "logits/rejected": -2.133040189743042, "logps/chosen": -3.269418478012085, "logps/rejected": -4.116382122039795, "loss": 0.3923, "rewards/accuracies": 1.0, "rewards/chosen": 1.2147505283355713, "rewards/margins": 0.7332686185836792, "rewards/rejected": 0.4814818799495697, "step": 3036 }, { "epoch": 1.64, "learning_rate": 6.675739967894458e-08, "logits/chosen": -2.119222640991211, "logits/rejected": -2.3088722229003906, "logps/chosen": -1.2996735572814941, "logps/rejected": -1.3638279438018799, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9958786368370056, "rewards/margins": 0.020198941230773926, "rewards/rejected": 0.9756796956062317, "step": 3037 }, { "epoch": 1.64, "learning_rate": 6.673682321545326e-08, "logits/chosen": -2.023747205734253, "logits/rejected": -2.2536685466766357, "logps/chosen": -1.024743914604187, "logps/rejected": -0.996989905834198, "loss": 0.667, "rewards/accuracies": 1.0, "rewards/chosen": 1.037822961807251, "rewards/margins": 0.05299544334411621, "rewards/rejected": 0.9848275184631348, "step": 3038 }, { "epoch": 1.64, "learning_rate": 6.671624355930212e-08, "logits/chosen": -2.2028794288635254, "logits/rejected": -2.1798481941223145, "logps/chosen": -4.854517459869385, "logps/rejected": -7.801119804382324, "loss": 0.4576, "rewards/accuracies": 1.0, "rewards/chosen": 1.0630143880844116, "rewards/margins": 0.5442134737968445, "rewards/rejected": 0.5188009142875671, "step": 3039 }, { "epoch": 1.64, "learning_rate": 6.669566071441688e-08, "logits/chosen": -2.0252468585968018, "logits/rejected": -2.2640926837921143, "logps/chosen": -0.27622178196907043, "logps/rejected": -0.333301305770874, "loss": 0.7042, "rewards/accuracies": 0.0, "rewards/chosen": 0.8830974698066711, "rewards/margins": -0.02192068099975586, "rewards/rejected": 0.905018150806427, "step": 3040 }, { "epoch": 1.64, "learning_rate": 6.667507468472382e-08, "logits/chosen": -2.1169393062591553, "logits/rejected": -2.285855770111084, "logps/chosen": -0.23020513355731964, "logps/rejected": -0.23927226662635803, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.9470864534378052, "rewards/margins": 0.012852311134338379, "rewards/rejected": 0.9342341423034668, "step": 3041 }, { "epoch": 1.64, "learning_rate": 6.665448547414987e-08, "logits/chosen": -2.0774922370910645, "logits/rejected": -2.3357038497924805, "logps/chosen": -0.18658998608589172, "logps/rejected": -0.1908314973115921, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.9214221239089966, "rewards/margins": 0.006241023540496826, "rewards/rejected": 0.9151811003684998, "step": 3042 }, { "epoch": 1.64, "learning_rate": 6.663389308662259e-08, "logits/chosen": -2.0523061752319336, "logits/rejected": -2.333852529525757, "logps/chosen": -1.120710849761963, "logps/rejected": -1.2231310606002808, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.7951056361198425, "rewards/margins": 0.0326271653175354, "rewards/rejected": 0.7624784708023071, "step": 3043 }, { "epoch": 1.64, "learning_rate": 6.661329752607008e-08, "logits/chosen": -2.0777785778045654, "logits/rejected": -2.0743980407714844, "logps/chosen": -0.1780613660812378, "logps/rejected": -11.913497924804688, "loss": 0.4974, "rewards/accuracies": 1.0, "rewards/chosen": 0.8735012412071228, "rewards/margins": 0.4394722282886505, "rewards/rejected": 0.4340290129184723, "step": 3044 }, { "epoch": 1.64, "learning_rate": 6.659269879642108e-08, "logits/chosen": -2.0520994663238525, "logits/rejected": -2.3370776176452637, "logps/chosen": -1.2813994884490967, "logps/rejected": -1.157042384147644, "loss": 0.7126, "rewards/accuracies": 0.0, "rewards/chosen": 0.9824675917625427, "rewards/margins": -0.0384557843208313, "rewards/rejected": 1.020923376083374, "step": 3045 }, { "epoch": 1.64, "learning_rate": 6.657209690160499e-08, "logits/chosen": -2.2263410091400146, "logits/rejected": -2.3201308250427246, "logps/chosen": -2.703531503677368, "logps/rejected": -3.068711280822754, "loss": 0.6674, "rewards/accuracies": 1.0, "rewards/chosen": 0.7774016261100769, "rewards/margins": 0.05221754312515259, "rewards/rejected": 0.7251840829849243, "step": 3046 }, { "epoch": 1.64, "learning_rate": 6.655149184555169e-08, "logits/chosen": -2.0526349544525146, "logits/rejected": -2.054615020751953, "logps/chosen": -1.2655789852142334, "logps/rejected": -2.415682792663574, "loss": 0.5064, "rewards/accuracies": 1.0, "rewards/chosen": 1.0659316778182983, "rewards/margins": 0.4165176749229431, "rewards/rejected": 0.6494140028953552, "step": 3047 }, { "epoch": 1.64, "learning_rate": 6.653088363219175e-08, "logits/chosen": -2.115981101989746, "logits/rejected": -2.102613925933838, "logps/chosen": -4.017030239105225, "logps/rejected": -4.746359348297119, "loss": 0.3371, "rewards/accuracies": 1.0, "rewards/chosen": 1.6681938171386719, "rewards/margins": 0.9139546751976013, "rewards/rejected": 0.7542391419410706, "step": 3048 }, { "epoch": 1.64, "learning_rate": 6.651027226545631e-08, "logits/chosen": -2.1703262329101562, "logits/rejected": -2.1645352840423584, "logps/chosen": -7.044727802276611, "logps/rejected": -5.356540203094482, "loss": 0.3622, "rewards/accuracies": 1.0, "rewards/chosen": 1.26378333568573, "rewards/margins": 0.8288644552230835, "rewards/rejected": 0.4349188506603241, "step": 3049 }, { "epoch": 1.65, "learning_rate": 6.648965774927717e-08, "logits/chosen": -2.053589105606079, "logits/rejected": -2.0492498874664307, "logps/chosen": -4.135950088500977, "logps/rejected": -3.4152302742004395, "loss": 0.3001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5573503971099854, "rewards/margins": 1.0497260093688965, "rewards/rejected": 0.5076243877410889, "step": 3050 }, { "epoch": 1.65, "learning_rate": 6.646904008758665e-08, "logits/chosen": -2.0039427280426025, "logits/rejected": -2.0032944679260254, "logps/chosen": -0.7484631538391113, "logps/rejected": -1.6378426551818848, "loss": 0.5958, "rewards/accuracies": 1.0, "rewards/chosen": 1.031601905822754, "rewards/margins": 0.20517385005950928, "rewards/rejected": 0.8264280557632446, "step": 3051 }, { "epoch": 1.65, "learning_rate": 6.644841928431771e-08, "logits/chosen": -2.1174278259277344, "logits/rejected": -2.237243175506592, "logps/chosen": -0.17909467220306396, "logps/rejected": -0.19345462322235107, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.9456514716148376, "rewards/margins": 0.03078460693359375, "rewards/rejected": 0.9148668646812439, "step": 3052 }, { "epoch": 1.65, "learning_rate": 6.642779534340391e-08, "logits/chosen": -2.0944745540618896, "logits/rejected": -2.320377826690674, "logps/chosen": -0.4868090748786926, "logps/rejected": -0.47121620178222656, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 1.0107115507125854, "rewards/margins": 0.0060350894927978516, "rewards/rejected": 1.0046764612197876, "step": 3053 }, { "epoch": 1.65, "learning_rate": 6.640716826877942e-08, "logits/chosen": -2.0818259716033936, "logits/rejected": -2.2752809524536133, "logps/chosen": -0.9773288369178772, "logps/rejected": -1.065787434577942, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.9091874361038208, "rewards/margins": 0.03155338764190674, "rewards/rejected": 0.8776340484619141, "step": 3054 }, { "epoch": 1.65, "learning_rate": 6.638653806437895e-08, "logits/chosen": -2.0839829444885254, "logits/rejected": -2.0946853160858154, "logps/chosen": -1.579211711883545, "logps/rejected": -2.3390040397644043, "loss": 0.5045, "rewards/accuracies": 1.0, "rewards/chosen": 0.953865647315979, "rewards/margins": 0.4213510751724243, "rewards/rejected": 0.5325145721435547, "step": 3055 }, { "epoch": 1.65, "learning_rate": 6.636590473413786e-08, "logits/chosen": -2.038489818572998, "logits/rejected": -2.239405870437622, "logps/chosen": -1.3743746280670166, "logps/rejected": -1.2207505702972412, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.9363409280776978, "rewards/margins": 0.002316117286682129, "rewards/rejected": 0.9340248107910156, "step": 3056 }, { "epoch": 1.65, "learning_rate": 6.634526828199214e-08, "logits/chosen": -2.0424530506134033, "logits/rejected": -2.2592480182647705, "logps/chosen": -0.28673893213272095, "logps/rejected": -0.35026678442955017, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.7736608386039734, "rewards/margins": 0.01246112585067749, "rewards/rejected": 0.7611997127532959, "step": 3057 }, { "epoch": 1.65, "learning_rate": 6.632462871187827e-08, "logits/chosen": -2.133120059967041, "logits/rejected": -2.2356033325195312, "logps/chosen": -0.3554406762123108, "logps/rejected": -0.4657050669193268, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.9595195055007935, "rewards/margins": 0.031370580196380615, "rewards/rejected": 0.9281489253044128, "step": 3058 }, { "epoch": 1.65, "learning_rate": 6.630398602773342e-08, "logits/chosen": -2.2030563354492188, "logits/rejected": -2.282931089401245, "logps/chosen": -8.83775520324707, "logps/rejected": -5.673823833465576, "loss": 0.7045, "rewards/accuracies": 0.0, "rewards/chosen": 0.789096474647522, "rewards/margins": -0.022647440433502197, "rewards/rejected": 0.8117439150810242, "step": 3059 }, { "epoch": 1.65, "learning_rate": 6.628334023349529e-08, "logits/chosen": -2.014143705368042, "logits/rejected": -2.2836074829101562, "logps/chosen": -0.5069746375083923, "logps/rejected": -5.490584373474121, "loss": 0.6099, "rewards/accuracies": 1.0, "rewards/chosen": 0.8165876269340515, "rewards/margins": 0.17403018474578857, "rewards/rejected": 0.6425574421882629, "step": 3060 }, { "epoch": 1.65, "learning_rate": 6.626269133310224e-08, "logits/chosen": -2.020009756088257, "logits/rejected": -2.2808103561401367, "logps/chosen": -0.6359476447105408, "logps/rejected": -0.7536617517471313, "loss": 0.6741, "rewards/accuracies": 1.0, "rewards/chosen": 0.8062605261802673, "rewards/margins": 0.038387835025787354, "rewards/rejected": 0.76787269115448, "step": 3061 }, { "epoch": 1.65, "learning_rate": 6.624203933049316e-08, "logits/chosen": -1.9757567644119263, "logits/rejected": -2.286658763885498, "logps/chosen": -0.9627217054367065, "logps/rejected": -7.908905982971191, "loss": 0.5176, "rewards/accuracies": 1.0, "rewards/chosen": 0.8437907099723816, "rewards/margins": 0.3886262774467468, "rewards/rejected": 0.45516443252563477, "step": 3062 }, { "epoch": 1.65, "learning_rate": 6.622138422960756e-08, "logits/chosen": -2.0522754192352295, "logits/rejected": -2.0536983013153076, "logps/chosen": -1.9255820512771606, "logps/rejected": -1.7094186544418335, "loss": 0.6675, "rewards/accuracies": 1.0, "rewards/chosen": 1.1717851161956787, "rewards/margins": 0.05202984809875488, "rewards/rejected": 1.1197552680969238, "step": 3063 }, { "epoch": 1.65, "learning_rate": 6.620072603438554e-08, "logits/chosen": -2.0365540981292725, "logits/rejected": -2.2982373237609863, "logps/chosen": -5.493653774261475, "logps/rejected": -6.7040276527404785, "loss": 0.6247, "rewards/accuracies": 1.0, "rewards/chosen": 0.6182191371917725, "rewards/margins": 0.14195013046264648, "rewards/rejected": 0.476269006729126, "step": 3064 }, { "epoch": 1.65, "learning_rate": 6.618006474876776e-08, "logits/chosen": -1.9926409721374512, "logits/rejected": -1.9945732355117798, "logps/chosen": -1.5559189319610596, "logps/rejected": -3.4192698001861572, "loss": 0.5441, "rewards/accuracies": 1.0, "rewards/chosen": 0.8879914283752441, "rewards/margins": 0.3243201971054077, "rewards/rejected": 0.5636712312698364, "step": 3065 }, { "epoch": 1.65, "learning_rate": 6.615940037669555e-08, "logits/chosen": -2.0619900226593018, "logits/rejected": -2.2366037368774414, "logps/chosen": -1.935335397720337, "logps/rejected": -4.519250869750977, "loss": 0.6455, "rewards/accuracies": 1.0, "rewards/chosen": 1.0264716148376465, "rewards/margins": 0.0977562665939331, "rewards/rejected": 0.9287153482437134, "step": 3066 }, { "epoch": 1.65, "learning_rate": 6.613873292211075e-08, "logits/chosen": -2.0469958782196045, "logits/rejected": -2.0520036220550537, "logps/chosen": -10.15468978881836, "logps/rejected": -3.992570400238037, "loss": 0.6421, "rewards/accuracies": 1.0, "rewards/chosen": 0.9439598321914673, "rewards/margins": 0.10474282503128052, "rewards/rejected": 0.8392170071601868, "step": 3067 }, { "epoch": 1.65, "learning_rate": 6.611806238895579e-08, "logits/chosen": -2.2034220695495605, "logits/rejected": -2.2022366523742676, "logps/chosen": -2.4912142753601074, "logps/rejected": -5.0526604652404785, "loss": 0.3219, "rewards/accuracies": 1.0, "rewards/chosen": 1.3870853185653687, "rewards/margins": 0.968334972858429, "rewards/rejected": 0.4187503457069397, "step": 3068 }, { "epoch": 1.66, "learning_rate": 6.609738878117374e-08, "logits/chosen": -2.3202295303344727, "logits/rejected": -2.178363084793091, "logps/chosen": -35.34449768066406, "logps/rejected": -1.676389455795288, "loss": 0.1764, "rewards/accuracies": 1.0, "rewards/chosen": 2.5526177883148193, "rewards/margins": 1.645639181137085, "rewards/rejected": 0.9069785475730896, "step": 3069 }, { "epoch": 1.66, "learning_rate": 6.607671210270824e-08, "logits/chosen": -2.2375669479370117, "logits/rejected": -2.0504884719848633, "logps/chosen": -43.112030029296875, "logps/rejected": -2.7393550872802734, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 2.7933967113494873, "rewards/margins": 2.1639914512634277, "rewards/rejected": 0.62940514087677, "step": 3070 }, { "epoch": 1.66, "learning_rate": 6.605603235750347e-08, "logits/chosen": -2.0378522872924805, "logits/rejected": -2.0373196601867676, "logps/chosen": -3.7304909229278564, "logps/rejected": -2.4776458740234375, "loss": 0.4712, "rewards/accuracies": 1.0, "rewards/chosen": 1.3508191108703613, "rewards/margins": 0.5077542066574097, "rewards/rejected": 0.8430649042129517, "step": 3071 }, { "epoch": 1.66, "learning_rate": 6.603534954950424e-08, "logits/chosen": -2.0705161094665527, "logits/rejected": -2.3019721508026123, "logps/chosen": -0.6061137318611145, "logps/rejected": -0.6518568396568298, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 1.1631624698638916, "rewards/margins": 0.006888151168823242, "rewards/rejected": 1.1562743186950684, "step": 3072 }, { "epoch": 1.66, "learning_rate": 6.601466368265594e-08, "logits/chosen": -2.1620302200317383, "logits/rejected": -2.324820041656494, "logps/chosen": -4.928292274475098, "logps/rejected": -0.4901406466960907, "loss": 0.7236, "rewards/accuracies": 0.0, "rewards/chosen": 0.9176916480064392, "rewards/margins": -0.05997061729431152, "rewards/rejected": 0.9776622653007507, "step": 3073 }, { "epoch": 1.66, "learning_rate": 6.599397476090452e-08, "logits/chosen": -2.119133234024048, "logits/rejected": -2.1183433532714844, "logps/chosen": -4.4087910652160645, "logps/rejected": -5.631429672241211, "loss": 0.3389, "rewards/accuracies": 1.0, "rewards/chosen": 1.3776159286499023, "rewards/margins": 0.9078933596611023, "rewards/rejected": 0.46972256898880005, "step": 3074 }, { "epoch": 1.66, "learning_rate": 6.597328278819655e-08, "logits/chosen": -2.158121347427368, "logits/rejected": -2.379948139190674, "logps/chosen": -14.943682670593262, "logps/rejected": -9.778928756713867, "loss": 0.527, "rewards/accuracies": 1.0, "rewards/chosen": 1.136184573173523, "rewards/margins": 0.36557239294052124, "rewards/rejected": 0.7706121802330017, "step": 3075 }, { "epoch": 1.66, "learning_rate": 6.595258776847914e-08, "logits/chosen": -2.0332512855529785, "logits/rejected": -2.0285675525665283, "logps/chosen": -5.836934566497803, "logps/rejected": -3.53450345993042, "loss": 0.3304, "rewards/accuracies": 1.0, "rewards/chosen": 1.4786211252212524, "rewards/margins": 0.9376620650291443, "rewards/rejected": 0.5409590601921082, "step": 3076 }, { "epoch": 1.66, "learning_rate": 6.593188970570003e-08, "logits/chosen": -2.0491349697113037, "logits/rejected": -2.3226888179779053, "logps/chosen": -0.3501383364200592, "logps/rejected": -0.35440388321876526, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.8876771330833435, "rewards/margins": 0.01890021562576294, "rewards/rejected": 0.8687769174575806, "step": 3077 }, { "epoch": 1.66, "learning_rate": 6.591118860380748e-08, "logits/chosen": -2.0769853591918945, "logits/rejected": -2.264404773712158, "logps/chosen": -0.39856863021850586, "logps/rejected": -0.44393643736839294, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.945809006690979, "rewards/margins": 0.01871776580810547, "rewards/rejected": 0.9270912408828735, "step": 3078 }, { "epoch": 1.66, "learning_rate": 6.589048446675038e-08, "logits/chosen": -2.131075382232666, "logits/rejected": -2.2894790172576904, "logps/chosen": -0.4174540638923645, "logps/rejected": -0.44113579392433167, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.8295709490776062, "rewards/margins": 0.02719855308532715, "rewards/rejected": 0.802372395992279, "step": 3079 }, { "epoch": 1.66, "learning_rate": 6.586977729847818e-08, "logits/chosen": -2.060920476913452, "logits/rejected": -2.0992696285247803, "logps/chosen": -3.733883857727051, "logps/rejected": -9.748733520507812, "loss": 0.2489, "rewards/accuracies": 1.0, "rewards/chosen": 1.7113689184188843, "rewards/margins": 1.2635846138000488, "rewards/rejected": 0.4477842450141907, "step": 3080 }, { "epoch": 1.66, "learning_rate": 6.584906710294091e-08, "logits/chosen": -2.1581368446350098, "logits/rejected": -2.2902982234954834, "logps/chosen": -0.23787575960159302, "logps/rejected": -0.24842143058776855, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.9453014731407166, "rewards/margins": 0.013867974281311035, "rewards/rejected": 0.9314334988594055, "step": 3081 }, { "epoch": 1.66, "learning_rate": 6.582835388408915e-08, "logits/chosen": -2.084054708480835, "logits/rejected": -2.084840774536133, "logps/chosen": -1.5077072381973267, "logps/rejected": -1.9212621450424194, "loss": 0.6467, "rewards/accuracies": 1.0, "rewards/chosen": 1.0099066495895386, "rewards/margins": 0.09521746635437012, "rewards/rejected": 0.9146891832351685, "step": 3082 }, { "epoch": 1.66, "learning_rate": 6.580763764587412e-08, "logits/chosen": -2.110799551010132, "logits/rejected": -2.2771358489990234, "logps/chosen": -0.7985734343528748, "logps/rejected": -0.8016313910484314, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9075485467910767, "rewards/margins": 0.001183927059173584, "rewards/rejected": 0.9063646197319031, "step": 3083 }, { "epoch": 1.66, "learning_rate": 6.578691839224759e-08, "logits/chosen": -2.148658275604248, "logits/rejected": -2.296483278274536, "logps/chosen": -2.0529675483703613, "logps/rejected": -2.355642318725586, "loss": 0.6692, "rewards/accuracies": 1.0, "rewards/chosen": 0.7190874814987183, "rewards/margins": 0.04848754405975342, "rewards/rejected": 0.6705999374389648, "step": 3084 }, { "epoch": 1.66, "learning_rate": 6.576619612716187e-08, "logits/chosen": -2.0570900440216064, "logits/rejected": -2.0632967948913574, "logps/chosen": -2.1869771480560303, "logps/rejected": -3.7581441402435303, "loss": 0.3342, "rewards/accuracies": 1.0, "rewards/chosen": 1.4672354459762573, "rewards/margins": 0.9243234395980835, "rewards/rejected": 0.5429120063781738, "step": 3085 }, { "epoch": 1.66, "learning_rate": 6.574547085456985e-08, "logits/chosen": -2.231196165084839, "logits/rejected": -2.248997211456299, "logps/chosen": -2.5244691371917725, "logps/rejected": -8.372783660888672, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 1.4082201719284058, "rewards/margins": 0.889471709728241, "rewards/rejected": 0.5187484622001648, "step": 3086 }, { "epoch": 1.67, "learning_rate": 6.572474257842504e-08, "logits/chosen": -2.1305551528930664, "logits/rejected": -2.1311111450195312, "logps/chosen": -1.0168319940567017, "logps/rejected": -1.789839267730713, "loss": 0.5177, "rewards/accuracies": 1.0, "rewards/chosen": 1.1787763833999634, "rewards/margins": 0.3884672522544861, "rewards/rejected": 0.7903091311454773, "step": 3087 }, { "epoch": 1.67, "learning_rate": 6.570401130268148e-08, "logits/chosen": -2.2053520679473877, "logits/rejected": -2.2039551734924316, "logps/chosen": -0.6925903558731079, "logps/rejected": -6.805544376373291, "loss": 0.3895, "rewards/accuracies": 1.0, "rewards/chosen": 1.0718921422958374, "rewards/margins": 0.7419303059577942, "rewards/rejected": 0.3299618363380432, "step": 3088 }, { "epoch": 1.67, "learning_rate": 6.568327703129382e-08, "logits/chosen": -2.1082403659820557, "logits/rejected": -2.088813304901123, "logps/chosen": -4.316989898681641, "logps/rejected": -5.064521789550781, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/chosen": 1.1415269374847412, "rewards/margins": 0.5210555195808411, "rewards/rejected": 0.6204714179039001, "step": 3089 }, { "epoch": 1.67, "learning_rate": 6.566253976821727e-08, "logits/chosen": -2.000274181365967, "logits/rejected": -2.001296281814575, "logps/chosen": -0.19537076354026794, "logps/rejected": -5.609036922454834, "loss": 0.4473, "rewards/accuracies": 1.0, "rewards/chosen": 0.9466863870620728, "rewards/margins": 0.5725610256195068, "rewards/rejected": 0.37412533164024353, "step": 3090 }, { "epoch": 1.67, "learning_rate": 6.564179951740755e-08, "logits/chosen": -2.1050660610198975, "logits/rejected": -2.2807774543762207, "logps/chosen": -1.538882851600647, "logps/rejected": -1.3943779468536377, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.8292970061302185, "rewards/margins": 0.011829793453216553, "rewards/rejected": 0.817467212677002, "step": 3091 }, { "epoch": 1.67, "learning_rate": 6.562105628282104e-08, "logits/chosen": -2.1161646842956543, "logits/rejected": -2.2239561080932617, "logps/chosen": -1.355594515800476, "logps/rejected": -1.4277924299240112, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": 0.9323736429214478, "rewards/margins": -0.014871001243591309, "rewards/rejected": 0.9472446441650391, "step": 3092 }, { "epoch": 1.67, "learning_rate": 6.560031006841463e-08, "logits/chosen": -2.0224528312683105, "logits/rejected": -2.0269861221313477, "logps/chosen": -5.074896812438965, "logps/rejected": -1.6734366416931152, "loss": 0.6056, "rewards/accuracies": 1.0, "rewards/chosen": 0.9954089522361755, "rewards/margins": 0.18354427814483643, "rewards/rejected": 0.8118646740913391, "step": 3093 }, { "epoch": 1.67, "learning_rate": 6.557956087814582e-08, "logits/chosen": -2.053119421005249, "logits/rejected": -2.220857858657837, "logps/chosen": -0.5128113627433777, "logps/rejected": -1.7201740741729736, "loss": 0.6649, "rewards/accuracies": 1.0, "rewards/chosen": 0.940457284450531, "rewards/margins": 0.05724763870239258, "rewards/rejected": 0.8832096457481384, "step": 3094 }, { "epoch": 1.67, "learning_rate": 6.555880871597261e-08, "logits/chosen": -2.053154945373535, "logits/rejected": -2.2920472621917725, "logps/chosen": -1.3400192260742188, "logps/rejected": -4.012760639190674, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 1.0388292074203491, "rewards/margins": 0.024732112884521484, "rewards/rejected": 1.0140970945358276, "step": 3095 }, { "epoch": 1.67, "learning_rate": 6.553805358585367e-08, "logits/chosen": -1.9400324821472168, "logits/rejected": -1.949847936630249, "logps/chosen": -1.3727102279663086, "logps/rejected": -2.8691728115081787, "loss": 0.5131, "rewards/accuracies": 1.0, "rewards/chosen": 1.0317790508270264, "rewards/margins": 0.39969152212142944, "rewards/rejected": 0.6320875287055969, "step": 3096 }, { "epoch": 1.67, "learning_rate": 6.551729549174814e-08, "logits/chosen": -2.0587689876556396, "logits/rejected": -2.224933624267578, "logps/chosen": -4.35481595993042, "logps/rejected": -6.441746234893799, "loss": 0.502, "rewards/accuracies": 1.0, "rewards/chosen": 0.8960846066474915, "rewards/margins": 0.4277707636356354, "rewards/rejected": 0.4683138430118561, "step": 3097 }, { "epoch": 1.67, "learning_rate": 6.549653443761578e-08, "logits/chosen": -1.9936840534210205, "logits/rejected": -2.286404609680176, "logps/chosen": -0.1781689077615738, "logps/rejected": -0.19132664799690247, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9058647155761719, "rewards/margins": 0.012504100799560547, "rewards/rejected": 0.8933606147766113, "step": 3098 }, { "epoch": 1.67, "learning_rate": 6.54757704274169e-08, "logits/chosen": -2.0116140842437744, "logits/rejected": -2.0232784748077393, "logps/chosen": -2.7772176265716553, "logps/rejected": -1.6031829118728638, "loss": 0.4261, "rewards/accuracies": 1.0, "rewards/chosen": 1.3222562074661255, "rewards/margins": 0.632377564907074, "rewards/rejected": 0.6898786425590515, "step": 3099 }, { "epoch": 1.67, "learning_rate": 6.545500346511237e-08, "logits/chosen": -1.9911413192749023, "logits/rejected": -2.2261693477630615, "logps/chosen": -1.9403377771377563, "logps/rejected": -0.9426394104957581, "loss": 0.6666, "rewards/accuracies": 1.0, "rewards/chosen": 0.8196404576301575, "rewards/margins": 0.05390673875808716, "rewards/rejected": 0.7657337188720703, "step": 3100 }, { "epoch": 1.67, "learning_rate": 6.54342335546636e-08, "logits/chosen": -2.033034324645996, "logits/rejected": -2.023210048675537, "logps/chosen": -9.433197021484375, "logps/rejected": -0.7361069321632385, "loss": 0.5665, "rewards/accuracies": 1.0, "rewards/chosen": 1.1852397918701172, "rewards/margins": 0.27178704738616943, "rewards/rejected": 0.9134527444839478, "step": 3101 }, { "epoch": 1.67, "learning_rate": 6.541346070003264e-08, "logits/chosen": -2.1756465435028076, "logits/rejected": -2.1812422275543213, "logps/chosen": -0.18619118630886078, "logps/rejected": -4.173090934753418, "loss": 0.4763, "rewards/accuracies": 1.0, "rewards/chosen": 0.8986508250236511, "rewards/margins": 0.49413976073265076, "rewards/rejected": 0.40451106429100037, "step": 3102 }, { "epoch": 1.67, "learning_rate": 6.5392684905182e-08, "logits/chosen": -2.026451826095581, "logits/rejected": -2.0267927646636963, "logps/chosen": -1.321323037147522, "logps/rejected": -2.1577301025390625, "loss": 0.5857, "rewards/accuracies": 1.0, "rewards/chosen": 1.086011290550232, "rewards/margins": 0.22793930768966675, "rewards/rejected": 0.8580719828605652, "step": 3103 }, { "epoch": 1.67, "learning_rate": 6.537190617407481e-08, "logits/chosen": -2.11263370513916, "logits/rejected": -2.112088918685913, "logps/chosen": -0.7742518186569214, "logps/rejected": -3.255314588546753, "loss": 0.4935, "rewards/accuracies": 1.0, "rewards/chosen": 1.0229462385177612, "rewards/margins": 0.44931095838546753, "rewards/rejected": 0.5736352801322937, "step": 3104 }, { "epoch": 1.67, "learning_rate": 6.535112451067476e-08, "logits/chosen": -2.055917501449585, "logits/rejected": -2.058162212371826, "logps/chosen": -1.1087555885314941, "logps/rejected": -2.6301302909851074, "loss": 0.5372, "rewards/accuracies": 1.0, "rewards/chosen": 0.904736340045929, "rewards/margins": 0.3407326936721802, "rewards/rejected": 0.5640036463737488, "step": 3105 }, { "epoch": 1.68, "learning_rate": 6.53303399189461e-08, "logits/chosen": -2.106431484222412, "logits/rejected": -2.105600357055664, "logps/chosen": -0.8962120413780212, "logps/rejected": -1.2204616069793701, "loss": 0.637, "rewards/accuracies": 1.0, "rewards/chosen": 0.9821414947509766, "rewards/margins": 0.11567896604537964, "rewards/rejected": 0.8664625287055969, "step": 3106 }, { "epoch": 1.68, "learning_rate": 6.530955240285362e-08, "logits/chosen": -2.074350357055664, "logits/rejected": -2.218418836593628, "logps/chosen": -2.5842318534851074, "logps/rejected": -2.5867865085601807, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.7198707461357117, "rewards/margins": 0.0073857903480529785, "rewards/rejected": 0.7124849557876587, "step": 3107 }, { "epoch": 1.68, "learning_rate": 6.528876196636269e-08, "logits/chosen": -2.0407581329345703, "logits/rejected": -2.0966145992279053, "logps/chosen": -1.6906836032867432, "logps/rejected": -7.919936656951904, "loss": 0.4552, "rewards/accuracies": 1.0, "rewards/chosen": 1.4880962371826172, "rewards/margins": 0.5508079528808594, "rewards/rejected": 0.9372882843017578, "step": 3108 }, { "epoch": 1.68, "learning_rate": 6.526796861343918e-08, "logits/chosen": -2.107621908187866, "logits/rejected": -2.1151881217956543, "logps/chosen": -1.9862730503082275, "logps/rejected": -3.546593427658081, "loss": 0.4608, "rewards/accuracies": 1.0, "rewards/chosen": 1.3120317459106445, "rewards/margins": 0.5355701446533203, "rewards/rejected": 0.7764616012573242, "step": 3109 }, { "epoch": 1.68, "learning_rate": 6.52471723480496e-08, "logits/chosen": -2.1535542011260986, "logits/rejected": -2.296470880508423, "logps/chosen": -0.41698968410491943, "logps/rejected": -0.4062041640281677, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.947568416595459, "rewards/margins": 0.01372385025024414, "rewards/rejected": 0.9338445663452148, "step": 3110 }, { "epoch": 1.68, "learning_rate": 6.522637317416095e-08, "logits/chosen": -2.174917221069336, "logits/rejected": -2.0720160007476807, "logps/chosen": -24.953842163085938, "logps/rejected": -3.94472336769104, "loss": 0.1611, "rewards/accuracies": 1.0, "rewards/chosen": 2.3007867336273193, "rewards/margins": 1.7441320419311523, "rewards/rejected": 0.5566547513008118, "step": 3111 }, { "epoch": 1.68, "learning_rate": 6.520557109574085e-08, "logits/chosen": -2.0579562187194824, "logits/rejected": -2.320875406265259, "logps/chosen": -1.0355716943740845, "logps/rejected": -1.0484471321105957, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8014622926712036, "rewards/margins": 0.02073991298675537, "rewards/rejected": 0.7807223796844482, "step": 3112 }, { "epoch": 1.68, "learning_rate": 6.518476611675739e-08, "logits/chosen": -2.1010544300079346, "logits/rejected": -2.100611448287964, "logps/chosen": -0.8910528421401978, "logps/rejected": -1.4113553762435913, "loss": 0.6535, "rewards/accuracies": 1.0, "rewards/chosen": 0.9333480000495911, "rewards/margins": 0.08092671632766724, "rewards/rejected": 0.8524212837219238, "step": 3113 }, { "epoch": 1.68, "learning_rate": 6.516395824117928e-08, "logits/chosen": -2.1359055042266846, "logits/rejected": -2.147252321243286, "logps/chosen": -2.4907593727111816, "logps/rejected": -3.4219970703125, "loss": 0.476, "rewards/accuracies": 1.0, "rewards/chosen": 1.4153696298599243, "rewards/margins": 0.494894802570343, "rewards/rejected": 0.9204748272895813, "step": 3114 }, { "epoch": 1.68, "learning_rate": 6.514314747297575e-08, "logits/chosen": -2.012190818786621, "logits/rejected": -2.020374298095703, "logps/chosen": -1.496820330619812, "logps/rejected": -3.6874639987945557, "loss": 0.487, "rewards/accuracies": 1.0, "rewards/chosen": 0.9719231724739075, "rewards/margins": 0.4662567377090454, "rewards/rejected": 0.5056664347648621, "step": 3115 }, { "epoch": 1.68, "learning_rate": 6.51223338161166e-08, "logits/chosen": -2.1012117862701416, "logits/rejected": -2.270202159881592, "logps/chosen": -1.254207968711853, "logps/rejected": -9.90455436706543, "loss": 0.6341, "rewards/accuracies": 1.0, "rewards/chosen": 1.0393742322921753, "rewards/margins": 0.1217268705368042, "rewards/rejected": 0.9176473617553711, "step": 3116 }, { "epoch": 1.68, "learning_rate": 6.510151727457217e-08, "logits/chosen": -1.958517074584961, "logits/rejected": -2.304271936416626, "logps/chosen": -0.1621674746274948, "logps/rejected": -0.19936783611774445, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.9487878084182739, "rewards/margins": 0.0025842785835266113, "rewards/rejected": 0.9462035298347473, "step": 3117 }, { "epoch": 1.68, "learning_rate": 6.508069785231336e-08, "logits/chosen": -1.9404338598251343, "logits/rejected": -1.9502314329147339, "logps/chosen": -1.2496005296707153, "logps/rejected": -2.7527825832366943, "loss": 0.4775, "rewards/accuracies": 1.0, "rewards/chosen": 1.0797282457351685, "rewards/margins": 0.4910009503364563, "rewards/rejected": 0.5887272953987122, "step": 3118 }, { "epoch": 1.68, "learning_rate": 6.505987555331159e-08, "logits/chosen": -2.0532636642456055, "logits/rejected": -2.0441060066223145, "logps/chosen": -4.318019866943359, "logps/rejected": -3.2751760482788086, "loss": 0.4226, "rewards/accuracies": 1.0, "rewards/chosen": 1.7787243127822876, "rewards/margins": 0.6425608396530151, "rewards/rejected": 1.1361634731292725, "step": 3119 }, { "epoch": 1.68, "learning_rate": 6.503905038153887e-08, "logits/chosen": -2.0395877361297607, "logits/rejected": -2.0226688385009766, "logps/chosen": -2.368703842163086, "logps/rejected": -3.3011879920959473, "loss": 0.4184, "rewards/accuracies": 1.0, "rewards/chosen": 1.2606815099716187, "rewards/margins": 0.6547073721885681, "rewards/rejected": 0.6059741377830505, "step": 3120 }, { "epoch": 1.68, "learning_rate": 6.501822234096774e-08, "logits/chosen": -2.1369638442993164, "logits/rejected": -2.1292145252227783, "logps/chosen": -3.3602354526519775, "logps/rejected": -4.716312885284424, "loss": 0.5186, "rewards/accuracies": 1.0, "rewards/chosen": 1.3658875226974487, "rewards/margins": 0.38621604442596436, "rewards/rejected": 0.9796714782714844, "step": 3121 }, { "epoch": 1.68, "learning_rate": 6.499739143557129e-08, "logits/chosen": -2.01413893699646, "logits/rejected": -2.0205001831054688, "logps/chosen": -1.512507677078247, "logps/rejected": -4.62967586517334, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 1.1359190940856934, "rewards/margins": 0.6346898674964905, "rewards/rejected": 0.5012292265892029, "step": 3122 }, { "epoch": 1.68, "learning_rate": 6.497655766932314e-08, "logits/chosen": -2.19626522064209, "logits/rejected": -2.30839467048645, "logps/chosen": -2.141723394393921, "logps/rejected": -2.133922815322876, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.7582876086235046, "rewards/margins": 0.019909262657165527, "rewards/rejected": 0.7383783459663391, "step": 3123 }, { "epoch": 1.69, "learning_rate": 6.495572104619748e-08, "logits/chosen": -2.0975568294525146, "logits/rejected": -2.2867701053619385, "logps/chosen": -0.5741572380065918, "logps/rejected": -0.63103848695755, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.7112693190574646, "rewards/margins": 0.012249410152435303, "rewards/rejected": 0.6990199089050293, "step": 3124 }, { "epoch": 1.69, "learning_rate": 6.493488157016902e-08, "logits/chosen": -2.080303430557251, "logits/rejected": -2.265022039413452, "logps/chosen": -0.34687572717666626, "logps/rejected": -0.43395906686782837, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.9236213564872742, "rewards/margins": 0.02509331703186035, "rewards/rejected": 0.8985280394554138, "step": 3125 }, { "epoch": 1.69, "learning_rate": 6.491403924521304e-08, "logits/chosen": -2.2465720176696777, "logits/rejected": -2.2372756004333496, "logps/chosen": -1.195116639137268, "logps/rejected": -3.7792835235595703, "loss": 0.4763, "rewards/accuracies": 1.0, "rewards/chosen": 0.9846601486206055, "rewards/margins": 0.49412718415260315, "rewards/rejected": 0.4905329644680023, "step": 3126 }, { "epoch": 1.69, "learning_rate": 6.489319407530532e-08, "logits/chosen": -1.998304843902588, "logits/rejected": -2.013787269592285, "logps/chosen": -1.4990614652633667, "logps/rejected": -8.907251358032227, "loss": 0.4469, "rewards/accuracies": 1.0, "rewards/chosen": 1.160188913345337, "rewards/margins": 0.5735300183296204, "rewards/rejected": 0.5866588950157166, "step": 3127 }, { "epoch": 1.69, "learning_rate": 6.487234606442228e-08, "logits/chosen": -2.102011203765869, "logits/rejected": -2.384104013442993, "logps/chosen": -16.832788467407227, "logps/rejected": -13.984315872192383, "loss": 0.7916, "rewards/accuracies": 0.0, "rewards/chosen": 1.0749952793121338, "rewards/margins": -0.18812525272369385, "rewards/rejected": 1.2631205320358276, "step": 3128 }, { "epoch": 1.69, "learning_rate": 6.485149521654077e-08, "logits/chosen": -2.0449161529541016, "logits/rejected": -2.315497636795044, "logps/chosen": -0.7011473178863525, "logps/rejected": -0.8792531490325928, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.9316725134849548, "rewards/margins": 0.004347801208496094, "rewards/rejected": 0.9273247122764587, "step": 3129 }, { "epoch": 1.69, "learning_rate": 6.483064153563823e-08, "logits/chosen": -1.992780089378357, "logits/rejected": -1.9875705242156982, "logps/chosen": -6.99166202545166, "logps/rejected": -3.6536548137664795, "loss": 0.4337, "rewards/accuracies": 1.0, "rewards/chosen": 1.6315525770187378, "rewards/margins": 0.6107257604598999, "rewards/rejected": 1.020826816558838, "step": 3130 }, { "epoch": 1.69, "learning_rate": 6.480978502569264e-08, "logits/chosen": -2.3418428897857666, "logits/rejected": -2.197129964828491, "logps/chosen": -22.209321975708008, "logps/rejected": -1.7313623428344727, "loss": 0.1932, "rewards/accuracies": 1.0, "rewards/chosen": 2.191758155822754, "rewards/margins": 1.5459158420562744, "rewards/rejected": 0.6458423733711243, "step": 3131 }, { "epoch": 1.69, "learning_rate": 6.478892569068253e-08, "logits/chosen": -2.077411651611328, "logits/rejected": -2.077697992324829, "logps/chosen": -0.6566867828369141, "logps/rejected": -2.542691469192505, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": 0.8112139105796814, "rewards/margins": 0.19900161027908325, "rewards/rejected": 0.6122123003005981, "step": 3132 }, { "epoch": 1.69, "learning_rate": 6.476806353458693e-08, "logits/chosen": -2.091158151626587, "logits/rejected": -2.0983214378356934, "logps/chosen": -2.627201795578003, "logps/rejected": -5.951407432556152, "loss": 0.3745, "rewards/accuracies": 1.0, "rewards/chosen": 1.134097933769226, "rewards/margins": 0.7891891002655029, "rewards/rejected": 0.34490880370140076, "step": 3133 }, { "epoch": 1.69, "learning_rate": 6.474719856138547e-08, "logits/chosen": -2.0362465381622314, "logits/rejected": -2.0374953746795654, "logps/chosen": -2.3609180450439453, "logps/rejected": -1.0846238136291504, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 1.5427439212799072, "rewards/margins": 0.6038742661476135, "rewards/rejected": 0.9388696551322937, "step": 3134 }, { "epoch": 1.69, "learning_rate": 6.472633077505824e-08, "logits/chosen": -2.1019370555877686, "logits/rejected": -2.111999750137329, "logps/chosen": -17.31130027770996, "logps/rejected": -7.675990581512451, "loss": 0.1869, "rewards/accuracies": 1.0, "rewards/chosen": 2.171462059020996, "rewards/margins": 1.5825085639953613, "rewards/rejected": 0.58895343542099, "step": 3135 }, { "epoch": 1.69, "learning_rate": 6.470546017958595e-08, "logits/chosen": -2.0873863697052, "logits/rejected": -2.301008462905884, "logps/chosen": -2.775939464569092, "logps/rejected": -2.786012649536133, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.6871910691261292, "rewards/margins": 0.042759835720062256, "rewards/rejected": 0.6444312334060669, "step": 3136 }, { "epoch": 1.69, "learning_rate": 6.468458677894978e-08, "logits/chosen": -1.8601056337356567, "logits/rejected": -2.305610418319702, "logps/chosen": -0.7540731430053711, "logps/rejected": -1.0071728229522705, "loss": 0.6622, "rewards/accuracies": 1.0, "rewards/chosen": 0.7871500253677368, "rewards/margins": 0.06283777952194214, "rewards/rejected": 0.7243122458457947, "step": 3137 }, { "epoch": 1.69, "learning_rate": 6.466371057713147e-08, "logits/chosen": -1.9886963367462158, "logits/rejected": -2.3170695304870605, "logps/chosen": -1.0166101455688477, "logps/rejected": -0.9853770732879639, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 0.7362451553344727, "rewards/margins": 0.030139625072479248, "rewards/rejected": 0.7061055302619934, "step": 3138 }, { "epoch": 1.69, "learning_rate": 6.464283157811331e-08, "logits/chosen": -2.0456748008728027, "logits/rejected": -2.0432486534118652, "logps/chosen": -2.099181652069092, "logps/rejected": -5.572704792022705, "loss": 0.415, "rewards/accuracies": 1.0, "rewards/chosen": 1.0270007848739624, "rewards/margins": 0.6646641492843628, "rewards/rejected": 0.3623366057872772, "step": 3139 }, { "epoch": 1.69, "learning_rate": 6.462194978587808e-08, "logits/chosen": -2.174018383026123, "logits/rejected": -2.2009694576263428, "logps/chosen": -15.346765518188477, "logps/rejected": -3.8787951469421387, "loss": 0.3613, "rewards/accuracies": 1.0, "rewards/chosen": 1.845990538597107, "rewards/margins": 0.8319648504257202, "rewards/rejected": 1.0140256881713867, "step": 3140 }, { "epoch": 1.69, "learning_rate": 6.460106520440915e-08, "logits/chosen": -2.16153621673584, "logits/rejected": -2.268329620361328, "logps/chosen": -3.1263859272003174, "logps/rejected": -3.1816298961639404, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 1.079403042793274, "rewards/margins": 0.02870941162109375, "rewards/rejected": 1.0506936311721802, "step": 3141 }, { "epoch": 1.69, "learning_rate": 6.458017783769038e-08, "logits/chosen": -2.0518598556518555, "logits/rejected": -2.2954602241516113, "logps/chosen": -3.596449136734009, "logps/rejected": -3.3276450634002686, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.4551004469394684, "rewards/margins": -0.001194775104522705, "rewards/rejected": 0.4562952220439911, "step": 3142 }, { "epoch": 1.7, "learning_rate": 6.455928768970616e-08, "logits/chosen": -1.943052887916565, "logits/rejected": -1.9380731582641602, "logps/chosen": -4.253018856048584, "logps/rejected": -4.166345119476318, "loss": 0.2713, "rewards/accuracies": 1.0, "rewards/chosen": 1.5568822622299194, "rewards/margins": 1.1659719944000244, "rewards/rejected": 0.39091020822525024, "step": 3143 }, { "epoch": 1.7, "learning_rate": 6.453839476444143e-08, "logits/chosen": -2.1677820682525635, "logits/rejected": -2.1717545986175537, "logps/chosen": -2.8525373935699463, "logps/rejected": -4.1271538734436035, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": 1.207196593284607, "rewards/margins": 0.6950141787528992, "rewards/rejected": 0.5121824145317078, "step": 3144 }, { "epoch": 1.7, "learning_rate": 6.451749906588166e-08, "logits/chosen": -2.025275230407715, "logits/rejected": -2.2063772678375244, "logps/chosen": -0.5418844819068909, "logps/rejected": -0.5678712725639343, "loss": 0.6715, "rewards/accuracies": 1.0, "rewards/chosen": 0.9492616653442383, "rewards/margins": 0.0436747670173645, "rewards/rejected": 0.9055868983268738, "step": 3145 }, { "epoch": 1.7, "learning_rate": 6.449660059801286e-08, "logits/chosen": -2.0570404529571533, "logits/rejected": -2.2616472244262695, "logps/chosen": -0.40468454360961914, "logps/rejected": -0.39850613474845886, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.8338503241539001, "rewards/margins": -0.008082389831542969, "rewards/rejected": 0.8419327139854431, "step": 3146 }, { "epoch": 1.7, "learning_rate": 6.447569936482152e-08, "logits/chosen": -2.0817642211914062, "logits/rejected": -2.2844088077545166, "logps/chosen": -6.900755882263184, "logps/rejected": -10.673393249511719, "loss": 0.6194, "rewards/accuracies": 1.0, "rewards/chosen": 0.7864323854446411, "rewards/margins": 0.15342897176742554, "rewards/rejected": 0.6330034136772156, "step": 3147 }, { "epoch": 1.7, "learning_rate": 6.44547953702947e-08, "logits/chosen": -1.9648727178573608, "logits/rejected": -1.9658175706863403, "logps/chosen": -3.862758159637451, "logps/rejected": -1.8152194023132324, "loss": 0.3768, "rewards/accuracies": 1.0, "rewards/chosen": 1.474115014076233, "rewards/margins": 0.7818667888641357, "rewards/rejected": 0.6922482252120972, "step": 3148 }, { "epoch": 1.7, "learning_rate": 6.443388861841998e-08, "logits/chosen": -1.9947586059570312, "logits/rejected": -2.2331953048706055, "logps/chosen": -0.6816838979721069, "logps/rejected": -0.7788296341896057, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.1195342540740967, "rewards/margins": 0.011210083961486816, "rewards/rejected": 1.1083241701126099, "step": 3149 }, { "epoch": 1.7, "learning_rate": 6.441297911318546e-08, "logits/chosen": -2.0007903575897217, "logits/rejected": -2.003835916519165, "logps/chosen": -3.725771427154541, "logps/rejected": -1.4927581548690796, "loss": 0.6469, "rewards/accuracies": 1.0, "rewards/chosen": 1.2963981628417969, "rewards/margins": 0.09482002258300781, "rewards/rejected": 1.201578140258789, "step": 3150 }, { "epoch": 1.7, "learning_rate": 6.439206685857977e-08, "logits/chosen": -2.1057422161102295, "logits/rejected": -1.9743250608444214, "logps/chosen": -18.422039031982422, "logps/rejected": -8.653820991516113, "loss": 0.3349, "rewards/accuracies": 1.0, "rewards/chosen": 1.7251007556915283, "rewards/margins": 0.9218732118606567, "rewards/rejected": 0.8032275438308716, "step": 3151 }, { "epoch": 1.7, "learning_rate": 6.437115185859204e-08, "logits/chosen": -2.1283016204833984, "logits/rejected": -2.128739595413208, "logps/chosen": -1.096817135810852, "logps/rejected": -3.875997543334961, "loss": 0.5551, "rewards/accuracies": 1.0, "rewards/chosen": 0.9080063700675964, "rewards/margins": 0.29836326837539673, "rewards/rejected": 0.6096431016921997, "step": 3152 }, { "epoch": 1.7, "learning_rate": 6.435023411721197e-08, "logits/chosen": -2.138026237487793, "logits/rejected": -2.331810712814331, "logps/chosen": -1.442763328552246, "logps/rejected": -1.5014622211456299, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.6668953895568848, "rewards/margins": 0.024741649627685547, "rewards/rejected": 0.6421537399291992, "step": 3153 }, { "epoch": 1.7, "learning_rate": 6.432931363842977e-08, "logits/chosen": -2.0452985763549805, "logits/rejected": -2.325479745864868, "logps/chosen": -6.143372058868408, "logps/rejected": -5.849984645843506, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.899744987487793, "rewards/margins": -0.01061403751373291, "rewards/rejected": 0.9103590250015259, "step": 3154 }, { "epoch": 1.7, "learning_rate": 6.43083904262361e-08, "logits/chosen": -1.9603999853134155, "logits/rejected": -2.297865867614746, "logps/chosen": -0.7735674381256104, "logps/rejected": -11.568907737731934, "loss": 0.7848, "rewards/accuracies": 0.0, "rewards/chosen": 0.8750881552696228, "rewards/margins": -0.17552119493484497, "rewards/rejected": 1.0506093502044678, "step": 3155 }, { "epoch": 1.7, "learning_rate": 6.428746448462225e-08, "logits/chosen": -2.0807864665985107, "logits/rejected": -2.284802198410034, "logps/chosen": -0.19409048557281494, "logps/rejected": -0.18857403099536896, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.869088351726532, "rewards/margins": 0.03051304817199707, "rewards/rejected": 0.8385753035545349, "step": 3156 }, { "epoch": 1.7, "learning_rate": 6.426653581757998e-08, "logits/chosen": -2.1269936561584473, "logits/rejected": -2.2796971797943115, "logps/chosen": -0.9128168225288391, "logps/rejected": -2.1658143997192383, "loss": 0.592, "rewards/accuracies": 1.0, "rewards/chosen": 0.9788628816604614, "rewards/margins": 0.21370631456375122, "rewards/rejected": 0.7651565670967102, "step": 3157 }, { "epoch": 1.7, "learning_rate": 6.424560442910157e-08, "logits/chosen": -2.1741573810577393, "logits/rejected": -2.2924463748931885, "logps/chosen": -1.1084599494934082, "logps/rejected": -1.0854229927062988, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 1.0707050561904907, "rewards/margins": 0.011708259582519531, "rewards/rejected": 1.0589967966079712, "step": 3158 }, { "epoch": 1.7, "learning_rate": 6.42246703231798e-08, "logits/chosen": -2.036649703979492, "logits/rejected": -2.292127847671509, "logps/chosen": -8.738384246826172, "logps/rejected": -8.2415189743042, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": 0.8067951202392578, "rewards/margins": -0.016805589199066162, "rewards/rejected": 0.823600709438324, "step": 3159 }, { "epoch": 1.7, "learning_rate": 6.4203733503808e-08, "logits/chosen": -1.9630578756332397, "logits/rejected": -2.257380723953247, "logps/chosen": -3.2456393241882324, "logps/rejected": -1.685685157775879, "loss": 0.7912, "rewards/accuracies": 0.0, "rewards/chosen": 0.6086916327476501, "rewards/margins": -0.1874104142189026, "rewards/rejected": 0.7961020469665527, "step": 3160 }, { "epoch": 1.7, "learning_rate": 6.418279397498002e-08, "logits/chosen": -2.008106231689453, "logits/rejected": -2.0075929164886475, "logps/chosen": -0.9468618035316467, "logps/rejected": -2.868746280670166, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 1.0122641324996948, "rewards/margins": 0.4589419364929199, "rewards/rejected": 0.5533221960067749, "step": 3161 }, { "epoch": 1.71, "learning_rate": 6.41618517406902e-08, "logits/chosen": -2.054004192352295, "logits/rejected": -2.0576364994049072, "logps/chosen": -1.372910737991333, "logps/rejected": -4.0211381912231445, "loss": 0.4859, "rewards/accuracies": 1.0, "rewards/chosen": 1.2070215940475464, "rewards/margins": 0.468880832195282, "rewards/rejected": 0.7381407618522644, "step": 3162 }, { "epoch": 1.71, "learning_rate": 6.414090680493342e-08, "logits/chosen": -2.074709177017212, "logits/rejected": -2.295246124267578, "logps/chosen": -8.203422546386719, "logps/rejected": -10.756340026855469, "loss": 0.5602, "rewards/accuracies": 1.0, "rewards/chosen": 0.931896984577179, "rewards/margins": 0.28635329008102417, "rewards/rejected": 0.6455436944961548, "step": 3163 }, { "epoch": 1.71, "learning_rate": 6.411995917170506e-08, "logits/chosen": -2.0354294776916504, "logits/rejected": -2.327518939971924, "logps/chosen": -1.1974200010299683, "logps/rejected": -1.3897755146026611, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 1.2356356382369995, "rewards/margins": 0.022814154624938965, "rewards/rejected": 1.2128214836120605, "step": 3164 }, { "epoch": 1.71, "learning_rate": 6.4099008845001e-08, "logits/chosen": -2.0017828941345215, "logits/rejected": -2.25012469291687, "logps/chosen": -0.8622217774391174, "logps/rejected": -0.8193727731704712, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.872786819934845, "rewards/margins": 0.003658771514892578, "rewards/rejected": 0.8691280484199524, "step": 3165 }, { "epoch": 1.71, "learning_rate": 6.407805582881766e-08, "logits/chosen": -2.0654795169830322, "logits/rejected": -2.3228394985198975, "logps/chosen": -19.041501998901367, "logps/rejected": -16.162948608398438, "loss": 0.7825, "rewards/accuracies": 0.0, "rewards/chosen": 0.0003864288446493447, "rewards/margins": -0.17130164802074432, "rewards/rejected": 0.17168807983398438, "step": 3166 }, { "epoch": 1.71, "learning_rate": 6.405710012715201e-08, "logits/chosen": -2.064058303833008, "logits/rejected": -2.0721232891082764, "logps/chosen": -2.4825592041015625, "logps/rejected": -3.470470666885376, "loss": 0.6326, "rewards/accuracies": 1.0, "rewards/chosen": 0.9253547787666321, "rewards/margins": 0.1249312162399292, "rewards/rejected": 0.8004235625267029, "step": 3167 }, { "epoch": 1.71, "learning_rate": 6.403614174400145e-08, "logits/chosen": -2.1628124713897705, "logits/rejected": -2.3258140087127686, "logps/chosen": -2.7996792793273926, "logps/rejected": -2.767380475997925, "loss": 0.6714, "rewards/accuracies": 1.0, "rewards/chosen": 0.5538408160209656, "rewards/margins": 0.04399526119232178, "rewards/rejected": 0.5098455548286438, "step": 3168 }, { "epoch": 1.71, "learning_rate": 6.401518068336395e-08, "logits/chosen": -2.013260841369629, "logits/rejected": -2.2275779247283936, "logps/chosen": -0.5653566122055054, "logps/rejected": -0.5738450884819031, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.8465620279312134, "rewards/margins": 0.0036559104919433594, "rewards/rejected": 0.84290611743927, "step": 3169 }, { "epoch": 1.71, "learning_rate": 6.399421694923793e-08, "logits/chosen": -2.00581693649292, "logits/rejected": -2.0933170318603516, "logps/chosen": -13.283336639404297, "logps/rejected": -16.42070770263672, "loss": 0.4714, "rewards/accuracies": 1.0, "rewards/chosen": 1.5595169067382812, "rewards/margins": 0.5071268081665039, "rewards/rejected": 1.0523900985717773, "step": 3170 }, { "epoch": 1.71, "learning_rate": 6.397325054562242e-08, "logits/chosen": -2.0525426864624023, "logits/rejected": -2.15447735786438, "logps/chosen": -2.704129934310913, "logps/rejected": -10.407833099365234, "loss": 0.4788, "rewards/accuracies": 1.0, "rewards/chosen": 1.2980384826660156, "rewards/margins": 0.487440288066864, "rewards/rejected": 0.8105981945991516, "step": 3171 }, { "epoch": 1.71, "learning_rate": 6.395228147651684e-08, "logits/chosen": -2.162485122680664, "logits/rejected": -2.2884163856506348, "logps/chosen": -1.2576888799667358, "logps/rejected": -1.3212939500808716, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 1.0402209758758545, "rewards/margins": -0.012624025344848633, "rewards/rejected": 1.0528450012207031, "step": 3172 }, { "epoch": 1.71, "learning_rate": 6.393130974592121e-08, "logits/chosen": -2.0778515338897705, "logits/rejected": -2.030308485031128, "logps/chosen": -27.49866485595703, "logps/rejected": -3.044874668121338, "loss": 0.3477, "rewards/accuracies": 1.0, "rewards/chosen": 1.6388046741485596, "rewards/margins": 0.8776625394821167, "rewards/rejected": 0.7611421346664429, "step": 3173 }, { "epoch": 1.71, "learning_rate": 6.391033535783605e-08, "logits/chosen": -2.152170419692993, "logits/rejected": -2.151906967163086, "logps/chosen": -1.0087841749191284, "logps/rejected": -1.6301876306533813, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.8842344284057617, "rewards/margins": -0.007573604583740234, "rewards/rejected": 0.891808032989502, "step": 3174 }, { "epoch": 1.71, "learning_rate": 6.388935831626232e-08, "logits/chosen": -2.0688905715942383, "logits/rejected": -2.066377639770508, "logps/chosen": -6.72491979598999, "logps/rejected": -1.195343017578125, "loss": 0.3412, "rewards/accuracies": 1.0, "rewards/chosen": 1.737947702407837, "rewards/margins": 0.8998660445213318, "rewards/rejected": 0.8380816578865051, "step": 3175 }, { "epoch": 1.71, "learning_rate": 6.386837862520156e-08, "logits/chosen": -2.0720109939575195, "logits/rejected": -2.06827449798584, "logps/chosen": -3.8417551517486572, "logps/rejected": -10.398967742919922, "loss": 0.511, "rewards/accuracies": 1.0, "rewards/chosen": 1.2937296628952026, "rewards/margins": 0.4050522446632385, "rewards/rejected": 0.8886774182319641, "step": 3176 }, { "epoch": 1.71, "learning_rate": 6.384739628865576e-08, "logits/chosen": -2.057080030441284, "logits/rejected": -2.0628437995910645, "logps/chosen": -3.8908796310424805, "logps/rejected": -3.5527775287628174, "loss": 0.4075, "rewards/accuracies": 1.0, "rewards/chosen": 1.2613255977630615, "rewards/margins": 0.6870008707046509, "rewards/rejected": 0.5743247270584106, "step": 3177 }, { "epoch": 1.71, "learning_rate": 6.382641131062746e-08, "logits/chosen": -1.9949079751968384, "logits/rejected": -2.286677598953247, "logps/chosen": -4.94111442565918, "logps/rejected": -7.021951675415039, "loss": 0.6682, "rewards/accuracies": 1.0, "rewards/chosen": 0.778903603553772, "rewards/margins": 0.05048173666000366, "rewards/rejected": 0.7284218668937683, "step": 3178 }, { "epoch": 1.71, "learning_rate": 6.380542369511967e-08, "logits/chosen": -2.1295974254608154, "logits/rejected": -2.120976686477661, "logps/chosen": -13.207515716552734, "logps/rejected": -0.9397093653678894, "loss": 0.4513, "rewards/accuracies": 1.0, "rewards/chosen": 1.3259133100509644, "rewards/margins": 0.5614272952079773, "rewards/rejected": 0.7644860148429871, "step": 3179 }, { "epoch": 1.72, "learning_rate": 6.378443344613594e-08, "logits/chosen": -2.085608720779419, "logits/rejected": -2.2319443225860596, "logps/chosen": -3.8936891555786133, "logps/rejected": -0.3002358675003052, "loss": 0.6468, "rewards/accuracies": 1.0, "rewards/chosen": 0.8462104201316833, "rewards/margins": 0.0948442816734314, "rewards/rejected": 0.751366138458252, "step": 3180 }, { "epoch": 1.72, "learning_rate": 6.376344056768026e-08, "logits/chosen": -1.9739166498184204, "logits/rejected": -1.974595308303833, "logps/chosen": -1.2327687740325928, "logps/rejected": -1.6820403337478638, "loss": 0.5741, "rewards/accuracies": 1.0, "rewards/chosen": 1.0066434144973755, "rewards/margins": 0.2542076110839844, "rewards/rejected": 0.7524358034133911, "step": 3181 }, { "epoch": 1.72, "learning_rate": 6.37424450637572e-08, "logits/chosen": -2.179015874862671, "logits/rejected": -2.1791045665740967, "logps/chosen": -4.845067024230957, "logps/rejected": -3.2429916858673096, "loss": 0.3576, "rewards/accuracies": 1.0, "rewards/chosen": 1.429653525352478, "rewards/margins": 0.8441755175590515, "rewards/rejected": 0.5854780077934265, "step": 3182 }, { "epoch": 1.72, "learning_rate": 6.372144693837175e-08, "logits/chosen": -2.1625351905822754, "logits/rejected": -2.154093027114868, "logps/chosen": -7.840144157409668, "logps/rejected": -0.737011194229126, "loss": 0.4754, "rewards/accuracies": 1.0, "rewards/chosen": 1.466658115386963, "rewards/margins": 0.4964402914047241, "rewards/rejected": 0.9702178239822388, "step": 3183 }, { "epoch": 1.72, "learning_rate": 6.37004461955295e-08, "logits/chosen": -2.1015145778656006, "logits/rejected": -2.1111271381378174, "logps/chosen": -5.586064338684082, "logps/rejected": -3.132451295852661, "loss": 0.309, "rewards/accuracies": 1.0, "rewards/chosen": 1.6473407745361328, "rewards/margins": 1.0159242153167725, "rewards/rejected": 0.6314164996147156, "step": 3184 }, { "epoch": 1.72, "learning_rate": 6.367944283923642e-08, "logits/chosen": -2.032785654067993, "logits/rejected": -2.0438756942749023, "logps/chosen": -1.3692954778671265, "logps/rejected": -3.3914754390716553, "loss": 0.4895, "rewards/accuracies": 1.0, "rewards/chosen": 1.0412510633468628, "rewards/margins": 0.45973706245422363, "rewards/rejected": 0.5815140008926392, "step": 3185 }, { "epoch": 1.72, "learning_rate": 6.365843687349907e-08, "logits/chosen": -2.015296220779419, "logits/rejected": -2.022786855697632, "logps/chosen": -4.187724590301514, "logps/rejected": -6.147663593292236, "loss": 0.4383, "rewards/accuracies": 1.0, "rewards/chosen": 0.8964084982872009, "rewards/margins": 0.5977074503898621, "rewards/rejected": 0.29870104789733887, "step": 3186 }, { "epoch": 1.72, "learning_rate": 6.363742830232448e-08, "logits/chosen": -2.1228864192962646, "logits/rejected": -2.1166324615478516, "logps/chosen": -13.037046432495117, "logps/rejected": -3.3641068935394287, "loss": 0.4459, "rewards/accuracies": 1.0, "rewards/chosen": 1.151568055152893, "rewards/margins": 0.5764399766921997, "rewards/rejected": 0.5751280784606934, "step": 3187 }, { "epoch": 1.72, "learning_rate": 6.361641712972014e-08, "logits/chosen": -2.0863852500915527, "logits/rejected": -2.071326732635498, "logps/chosen": -0.6478764414787292, "logps/rejected": -5.218574523925781, "loss": 0.4398, "rewards/accuracies": 1.0, "rewards/chosen": 1.0928703546524048, "rewards/margins": 0.5935672521591187, "rewards/rejected": 0.49930307269096375, "step": 3188 }, { "epoch": 1.72, "learning_rate": 6.359540335969412e-08, "logits/chosen": -2.031592845916748, "logits/rejected": -2.326791763305664, "logps/chosen": -5.267905235290527, "logps/rejected": -5.161047458648682, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.3725692927837372, "rewards/margins": 0.023969799280166626, "rewards/rejected": 0.34859949350357056, "step": 3189 }, { "epoch": 1.72, "learning_rate": 6.357438699625491e-08, "logits/chosen": -2.1903679370880127, "logits/rejected": -2.154423952102661, "logps/chosen": -17.671781539916992, "logps/rejected": -3.0387537479400635, "loss": 0.2858, "rewards/accuracies": 1.0, "rewards/chosen": 1.6863244771957397, "rewards/margins": 1.1061300039291382, "rewards/rejected": 0.5801944732666016, "step": 3190 }, { "epoch": 1.72, "learning_rate": 6.355336804341151e-08, "logits/chosen": -1.9569271802902222, "logits/rejected": -1.928667664527893, "logps/chosen": -10.130045890808105, "logps/rejected": -3.344513416290283, "loss": 0.2904, "rewards/accuracies": 1.0, "rewards/chosen": 1.6057723760604858, "rewards/margins": 1.0876795053482056, "rewards/rejected": 0.5180928707122803, "step": 3191 }, { "epoch": 1.72, "learning_rate": 6.353234650517344e-08, "logits/chosen": -2.132342576980591, "logits/rejected": -2.0474700927734375, "logps/chosen": -13.581571578979492, "logps/rejected": -3.586848735809326, "loss": 0.3723, "rewards/accuracies": 1.0, "rewards/chosen": 1.6040445566177368, "rewards/margins": 0.7962180972099304, "rewards/rejected": 0.8078264594078064, "step": 3192 }, { "epoch": 1.72, "learning_rate": 6.351132238555072e-08, "logits/chosen": -2.0261123180389404, "logits/rejected": -2.297821521759033, "logps/chosen": -7.524218559265137, "logps/rejected": -6.935007572174072, "loss": 0.7065, "rewards/accuracies": 0.0, "rewards/chosen": 0.6310826539993286, "rewards/margins": -0.0266110897064209, "rewards/rejected": 0.6576937437057495, "step": 3193 }, { "epoch": 1.72, "learning_rate": 6.349029568855378e-08, "logits/chosen": -2.21103835105896, "logits/rejected": -2.226606845855713, "logps/chosen": -3.5844075679779053, "logps/rejected": -10.25919246673584, "loss": 0.4514, "rewards/accuracies": 1.0, "rewards/chosen": 1.3636105060577393, "rewards/margins": 0.5612576603889465, "rewards/rejected": 0.8023528456687927, "step": 3194 }, { "epoch": 1.72, "learning_rate": 6.346926641819365e-08, "logits/chosen": -2.15413498878479, "logits/rejected": -2.2835328578948975, "logps/chosen": -1.1388118267059326, "logps/rejected": -7.9279375076293945, "loss": 0.5471, "rewards/accuracies": 1.0, "rewards/chosen": 1.1574602127075195, "rewards/margins": 0.31713396310806274, "rewards/rejected": 0.8403262495994568, "step": 3195 }, { "epoch": 1.72, "learning_rate": 6.344823457848179e-08, "logits/chosen": -2.029935359954834, "logits/rejected": -2.025989532470703, "logps/chosen": -11.44059944152832, "logps/rejected": -5.473687171936035, "loss": 0.204, "rewards/accuracies": 1.0, "rewards/chosen": 2.178516149520874, "rewards/margins": 1.485655426979065, "rewards/rejected": 0.6928607225418091, "step": 3196 }, { "epoch": 1.72, "learning_rate": 6.342720017343016e-08, "logits/chosen": -2.1126224994659424, "logits/rejected": -2.099332094192505, "logps/chosen": -3.1508541107177734, "logps/rejected": -4.814477443695068, "loss": 0.4455, "rewards/accuracies": 1.0, "rewards/chosen": 1.036763310432434, "rewards/margins": 0.577526330947876, "rewards/rejected": 0.4592369496822357, "step": 3197 }, { "epoch": 1.72, "learning_rate": 6.340616320705121e-08, "logits/chosen": -2.143052101135254, "logits/rejected": -2.3385956287384033, "logps/chosen": -1.2557029724121094, "logps/rejected": -1.2203365564346313, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.8172222971916199, "rewards/margins": 0.0318334698677063, "rewards/rejected": 0.7853888273239136, "step": 3198 }, { "epoch": 1.73, "learning_rate": 6.338512368335789e-08, "logits/chosen": -2.0144989490509033, "logits/rejected": -2.017731189727783, "logps/chosen": -4.788379669189453, "logps/rejected": -6.51383113861084, "loss": 0.3993, "rewards/accuracies": 1.0, "rewards/chosen": 1.2739495038986206, "rewards/margins": 0.711757242679596, "rewards/rejected": 0.5621922612190247, "step": 3199 }, { "epoch": 1.73, "learning_rate": 6.336408160636362e-08, "logits/chosen": -2.234255790710449, "logits/rejected": -2.334728717803955, "logps/chosen": -0.22437609732151031, "logps/rejected": -0.1959274709224701, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 0.8061229586601257, "rewards/margins": 0.049414098262786865, "rewards/rejected": 0.7567088603973389, "step": 3200 }, { "epoch": 1.73, "learning_rate": 6.334303698008231e-08, "logits/chosen": -2.0662457942962646, "logits/rejected": -2.0610249042510986, "logps/chosen": -10.42973804473877, "logps/rejected": -5.361979007720947, "loss": 0.2692, "rewards/accuracies": 1.0, "rewards/chosen": 1.799174189567566, "rewards/margins": 1.1744780540466309, "rewards/rejected": 0.6246961355209351, "step": 3201 }, { "epoch": 1.73, "learning_rate": 6.332198980852838e-08, "logits/chosen": -2.1223080158233643, "logits/rejected": -2.135228157043457, "logps/chosen": -7.302268028259277, "logps/rejected": -1.5930330753326416, "loss": 0.56, "rewards/accuracies": 1.0, "rewards/chosen": 1.090497374534607, "rewards/margins": 0.2867305278778076, "rewards/rejected": 0.8037668466567993, "step": 3202 }, { "epoch": 1.73, "learning_rate": 6.33009400957167e-08, "logits/chosen": -2.1866872310638428, "logits/rejected": -2.2834789752960205, "logps/chosen": -1.4597469568252563, "logps/rejected": -1.409523606300354, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0370203256607056, "rewards/margins": 0.01861858367919922, "rewards/rejected": 1.0184017419815063, "step": 3203 }, { "epoch": 1.73, "learning_rate": 6.327988784566265e-08, "logits/chosen": -2.1339309215545654, "logits/rejected": -2.1290197372436523, "logps/chosen": -4.649903774261475, "logps/rejected": -6.944904327392578, "loss": 0.6207, "rewards/accuracies": 1.0, "rewards/chosen": 0.9242555499076843, "rewards/margins": 0.1506408452987671, "rewards/rejected": 0.7736147046089172, "step": 3204 }, { "epoch": 1.73, "learning_rate": 6.325883306238208e-08, "logits/chosen": -2.0822529792785645, "logits/rejected": -2.30424165725708, "logps/chosen": -1.351627230644226, "logps/rejected": -1.318968415260315, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0568150281906128, "rewards/margins": 0.02402472496032715, "rewards/rejected": 1.0327903032302856, "step": 3205 }, { "epoch": 1.73, "learning_rate": 6.323777574989134e-08, "logits/chosen": -2.2909798622131348, "logits/rejected": -2.2813668251037598, "logps/chosen": -0.514015793800354, "logps/rejected": -0.5332152247428894, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.942550003528595, "rewards/margins": 0.00544893741607666, "rewards/rejected": 0.9371010661125183, "step": 3206 }, { "epoch": 1.73, "learning_rate": 6.321671591220725e-08, "logits/chosen": -2.2557735443115234, "logits/rejected": -2.387608289718628, "logps/chosen": -0.6283190250396729, "logps/rejected": -0.7179455757141113, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.8532717823982239, "rewards/margins": 0.004236102104187012, "rewards/rejected": 0.8490356802940369, "step": 3207 }, { "epoch": 1.73, "learning_rate": 6.319565355334711e-08, "logits/chosen": -2.0314881801605225, "logits/rejected": -2.277463912963867, "logps/chosen": -0.45393794775009155, "logps/rejected": -0.4954577386379242, "loss": 0.6701, "rewards/accuracies": 1.0, "rewards/chosen": 0.9573850631713867, "rewards/margins": 0.04654818773269653, "rewards/rejected": 0.9108368754386902, "step": 3208 }, { "epoch": 1.73, "learning_rate": 6.317458867732869e-08, "logits/chosen": -2.0535647869110107, "logits/rejected": -2.2928683757781982, "logps/chosen": -0.9585028290748596, "logps/rejected": -0.9531315565109253, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.890040397644043, "rewards/margins": 0.011303603649139404, "rewards/rejected": 0.8787367939949036, "step": 3209 }, { "epoch": 1.73, "learning_rate": 6.315352128817028e-08, "logits/chosen": -2.187396764755249, "logits/rejected": -2.079197645187378, "logps/chosen": -38.418052673339844, "logps/rejected": -2.3010387420654297, "loss": 0.1502, "rewards/accuracies": 1.0, "rewards/chosen": 2.438988447189331, "rewards/margins": 1.8200533390045166, "rewards/rejected": 0.6189351081848145, "step": 3210 }, { "epoch": 1.73, "learning_rate": 6.31324513898906e-08, "logits/chosen": -2.0587072372436523, "logits/rejected": -2.050497531890869, "logps/chosen": -3.8963375091552734, "logps/rejected": -6.561491012573242, "loss": 0.3182, "rewards/accuracies": 1.0, "rewards/chosen": 1.4188984632492065, "rewards/margins": 0.9817179441452026, "rewards/rejected": 0.4371805191040039, "step": 3211 }, { "epoch": 1.73, "learning_rate": 6.31113789865089e-08, "logits/chosen": -2.0754733085632324, "logits/rejected": -2.270763874053955, "logps/chosen": -0.22378750145435333, "logps/rejected": -0.29070353507995605, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.8735861778259277, "rewards/margins": 0.019976317882537842, "rewards/rejected": 0.8536098599433899, "step": 3212 }, { "epoch": 1.73, "learning_rate": 6.309030408204484e-08, "logits/chosen": -1.970454216003418, "logits/rejected": -2.2533836364746094, "logps/chosen": -0.30895596742630005, "logps/rejected": -0.22928746044635773, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.9018260836601257, "rewards/margins": 0.04210144281387329, "rewards/rejected": 0.8597246408462524, "step": 3213 }, { "epoch": 1.73, "learning_rate": 6.306922668051865e-08, "logits/chosen": -2.0646791458129883, "logits/rejected": -2.319915533065796, "logps/chosen": -0.6900356411933899, "logps/rejected": -0.6263958215713501, "loss": 0.6728, "rewards/accuracies": 1.0, "rewards/chosen": 1.026843547821045, "rewards/margins": 0.041185081005096436, "rewards/rejected": 0.9856584668159485, "step": 3214 }, { "epoch": 1.73, "learning_rate": 6.304814678595094e-08, "logits/chosen": -2.073244571685791, "logits/rejected": -2.2471885681152344, "logps/chosen": -0.8347634077072144, "logps/rejected": -0.8936667442321777, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.8813678026199341, "rewards/margins": 0.015159904956817627, "rewards/rejected": 0.8662078976631165, "step": 3215 }, { "epoch": 1.73, "learning_rate": 6.302706440236288e-08, "logits/chosen": -2.130303382873535, "logits/rejected": -2.3242828845977783, "logps/chosen": -7.821139335632324, "logps/rejected": -8.134187698364258, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.9949849247932434, "rewards/margins": 0.02382880449295044, "rewards/rejected": 0.971156120300293, "step": 3216 }, { "epoch": 1.74, "learning_rate": 6.300597953377603e-08, "logits/chosen": -2.0863423347473145, "logits/rejected": -2.2713892459869385, "logps/chosen": -1.139186143875122, "logps/rejected": -1.1322882175445557, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.7237339019775391, "rewards/margins": 0.023069560527801514, "rewards/rejected": 0.7006643414497375, "step": 3217 }, { "epoch": 1.74, "learning_rate": 6.298489218421248e-08, "logits/chosen": -2.1014654636383057, "logits/rejected": -2.101158857345581, "logps/chosen": -1.306103229522705, "logps/rejected": -2.654489517211914, "loss": 0.5976, "rewards/accuracies": 1.0, "rewards/chosen": 0.9734640121459961, "rewards/margins": 0.2012903094291687, "rewards/rejected": 0.7721737027168274, "step": 3218 }, { "epoch": 1.74, "learning_rate": 6.29638023576948e-08, "logits/chosen": -2.234189510345459, "logits/rejected": -2.1002349853515625, "logps/chosen": -33.04450988769531, "logps/rejected": -1.140605092048645, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.941256046295166, "rewards/margins": 2.2056918144226074, "rewards/rejected": 0.7355643510818481, "step": 3219 }, { "epoch": 1.74, "learning_rate": 6.294271005824601e-08, "logits/chosen": -2.068027973175049, "logits/rejected": -2.2371418476104736, "logps/chosen": -3.058845281600952, "logps/rejected": -2.5092852115631104, "loss": 0.7067, "rewards/accuracies": 0.0, "rewards/chosen": 0.6721075177192688, "rewards/margins": -0.02691805362701416, "rewards/rejected": 0.699025571346283, "step": 3220 }, { "epoch": 1.74, "learning_rate": 6.29216152898896e-08, "logits/chosen": -2.1208295822143555, "logits/rejected": -2.135477304458618, "logps/chosen": -2.056925058364868, "logps/rejected": -5.909294605255127, "loss": 0.3346, "rewards/accuracies": 1.0, "rewards/chosen": 1.3702211380004883, "rewards/margins": 0.9227983951568604, "rewards/rejected": 0.44742274284362793, "step": 3221 }, { "epoch": 1.74, "learning_rate": 6.290051805664954e-08, "logits/chosen": -2.019554615020752, "logits/rejected": -2.0212905406951904, "logps/chosen": -1.3723965883255005, "logps/rejected": -2.5609047412872314, "loss": 0.5411, "rewards/accuracies": 1.0, "rewards/chosen": 0.9420585036277771, "rewards/margins": 0.33143502473831177, "rewards/rejected": 0.6106234788894653, "step": 3222 }, { "epoch": 1.74, "learning_rate": 6.287941836255026e-08, "logits/chosen": -2.017852544784546, "logits/rejected": -2.0339317321777344, "logps/chosen": -3.181950569152832, "logps/rejected": -8.13992691040039, "loss": 0.4397, "rewards/accuracies": 1.0, "rewards/chosen": 1.134379506111145, "rewards/margins": 0.5938410758972168, "rewards/rejected": 0.5405384302139282, "step": 3223 }, { "epoch": 1.74, "learning_rate": 6.285831621161666e-08, "logits/chosen": -2.1264431476593018, "logits/rejected": -2.3261544704437256, "logps/chosen": -3.7265796661376953, "logps/rejected": -1.183214545249939, "loss": 0.6655, "rewards/accuracies": 1.0, "rewards/chosen": 0.9406211972236633, "rewards/margins": 0.056025683879852295, "rewards/rejected": 0.884595513343811, "step": 3224 }, { "epoch": 1.74, "learning_rate": 6.283721160787411e-08, "logits/chosen": -2.0564820766448975, "logits/rejected": -2.043490171432495, "logps/chosen": -20.1746883392334, "logps/rejected": -1.117223858833313, "loss": 0.2476, "rewards/accuracies": 1.0, "rewards/chosen": 2.112905263900757, "rewards/margins": 1.2697713375091553, "rewards/rejected": 0.8431339263916016, "step": 3225 }, { "epoch": 1.74, "learning_rate": 6.28161045553485e-08, "logits/chosen": -2.0418789386749268, "logits/rejected": -2.306657075881958, "logps/chosen": -2.174208879470825, "logps/rejected": -2.0102195739746094, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 1.2067140340805054, "rewards/margins": 0.03146636486053467, "rewards/rejected": 1.1752476692199707, "step": 3226 }, { "epoch": 1.74, "learning_rate": 6.279499505806609e-08, "logits/chosen": -2.086496353149414, "logits/rejected": -2.0892183780670166, "logps/chosen": -1.5244046449661255, "logps/rejected": -2.2313051223754883, "loss": 0.4985, "rewards/accuracies": 1.0, "rewards/chosen": 1.2492008209228516, "rewards/margins": 0.43661659955978394, "rewards/rejected": 0.8125842213630676, "step": 3227 }, { "epoch": 1.74, "learning_rate": 6.277388312005367e-08, "logits/chosen": -2.1357545852661133, "logits/rejected": -1.940796136856079, "logps/chosen": -36.91264724731445, "logps/rejected": -2.835114002227783, "loss": 0.1818, "rewards/accuracies": 1.0, "rewards/chosen": 2.2424144744873047, "rewards/margins": 1.61279296875, "rewards/rejected": 0.6296214461326599, "step": 3228 }, { "epoch": 1.74, "learning_rate": 6.27527687453385e-08, "logits/chosen": -2.0509567260742188, "logits/rejected": -2.1666409969329834, "logps/chosen": -0.48343825340270996, "logps/rejected": -17.02775764465332, "loss": 0.4969, "rewards/accuracies": 1.0, "rewards/chosen": 0.9766538739204407, "rewards/margins": 0.440738320350647, "rewards/rejected": 0.5359155535697937, "step": 3229 }, { "epoch": 1.74, "learning_rate": 6.273165193794828e-08, "logits/chosen": -2.0665230751037598, "logits/rejected": -2.0650997161865234, "logps/chosen": -0.8658161163330078, "logps/rejected": -5.134613990783691, "loss": 0.4351, "rewards/accuracies": 1.0, "rewards/chosen": 1.1102455854415894, "rewards/margins": 0.6066648960113525, "rewards/rejected": 0.5035806894302368, "step": 3230 }, { "epoch": 1.74, "learning_rate": 6.271053270191116e-08, "logits/chosen": -2.1504571437835693, "logits/rejected": -2.1494266986846924, "logps/chosen": -3.3219404220581055, "logps/rejected": -2.3825843334198, "loss": 0.4468, "rewards/accuracies": 1.0, "rewards/chosen": 1.3735156059265137, "rewards/margins": 0.5739056468009949, "rewards/rejected": 0.7996099591255188, "step": 3231 }, { "epoch": 1.74, "learning_rate": 6.26894110412558e-08, "logits/chosen": -2.021237850189209, "logits/rejected": -2.019500970840454, "logps/chosen": -0.2687680423259735, "logps/rejected": -4.116837978363037, "loss": 0.4867, "rewards/accuracies": 1.0, "rewards/chosen": 0.9731095433235168, "rewards/margins": 0.46689504384994507, "rewards/rejected": 0.5062144994735718, "step": 3232 }, { "epoch": 1.74, "learning_rate": 6.266828696001127e-08, "logits/chosen": -2.018606662750244, "logits/rejected": -2.1860392093658447, "logps/chosen": -0.28652581572532654, "logps/rejected": -0.3213192820549011, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.8275681734085083, "rewards/margins": 0.022297680377960205, "rewards/rejected": 0.8052704930305481, "step": 3233 }, { "epoch": 1.74, "learning_rate": 6.264716046220713e-08, "logits/chosen": -2.007756233215332, "logits/rejected": -1.9976305961608887, "logps/chosen": -5.351100921630859, "logps/rejected": -4.399181365966797, "loss": 0.4676, "rewards/accuracies": 1.0, "rewards/chosen": 1.1431411504745483, "rewards/margins": 0.5172690153121948, "rewards/rejected": 0.6258721351623535, "step": 3234 }, { "epoch": 1.74, "learning_rate": 6.262603155187343e-08, "logits/chosen": -2.0706567764282227, "logits/rejected": -2.0700907707214355, "logps/chosen": -1.3348243236541748, "logps/rejected": -3.0206193923950195, "loss": 0.6069, "rewards/accuracies": 1.0, "rewards/chosen": 0.9971732497215271, "rewards/margins": 0.18062889575958252, "rewards/rejected": 0.8165443539619446, "step": 3235 }, { "epoch": 1.75, "learning_rate": 6.260490023304062e-08, "logits/chosen": -1.959072470664978, "logits/rejected": -1.9711002111434937, "logps/chosen": -6.195734024047852, "logps/rejected": -4.628762245178223, "loss": 0.3646, "rewards/accuracies": 1.0, "rewards/chosen": 1.484460473060608, "rewards/margins": 0.8212558031082153, "rewards/rejected": 0.6632046699523926, "step": 3236 }, { "epoch": 1.75, "learning_rate": 6.258376650973966e-08, "logits/chosen": -2.045748233795166, "logits/rejected": -2.289614677429199, "logps/chosen": -9.769861221313477, "logps/rejected": -8.715131759643555, "loss": 0.753, "rewards/accuracies": 0.0, "rewards/chosen": 0.7148413062095642, "rewards/margins": -0.11639821529388428, "rewards/rejected": 0.8312395215034485, "step": 3237 }, { "epoch": 1.75, "learning_rate": 6.256263038600194e-08, "logits/chosen": -2.0915300846099854, "logits/rejected": -2.0933659076690674, "logps/chosen": -0.42527833580970764, "logps/rejected": -3.2423107624053955, "loss": 0.4825, "rewards/accuracies": 1.0, "rewards/chosen": 1.0147684812545776, "rewards/margins": 0.47787660360336304, "rewards/rejected": 0.5368918776512146, "step": 3238 }, { "epoch": 1.75, "learning_rate": 6.25414918658593e-08, "logits/chosen": -2.0305659770965576, "logits/rejected": -2.324252128601074, "logps/chosen": -0.5608856678009033, "logps/rejected": -5.1946892738342285, "loss": 0.5132, "rewards/accuracies": 1.0, "rewards/chosen": 1.1364177465438843, "rewards/margins": 0.3995741009712219, "rewards/rejected": 0.7368436455726624, "step": 3239 }, { "epoch": 1.75, "learning_rate": 6.252035095334407e-08, "logits/chosen": -2.0623886585235596, "logits/rejected": -2.254854440689087, "logps/chosen": -1.5832306146621704, "logps/rejected": -1.708317518234253, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.6328508257865906, "rewards/margins": 0.021537482738494873, "rewards/rejected": 0.6113133430480957, "step": 3240 }, { "epoch": 1.75, "learning_rate": 6.249920765248902e-08, "logits/chosen": -2.0732767581939697, "logits/rejected": -2.075176477432251, "logps/chosen": -4.452720642089844, "logps/rejected": -2.4468119144439697, "loss": 0.3609, "rewards/accuracies": 1.0, "rewards/chosen": 1.6286144256591797, "rewards/margins": 0.8333421349525452, "rewards/rejected": 0.7952722907066345, "step": 3241 }, { "epoch": 1.75, "learning_rate": 6.247806196732737e-08, "logits/chosen": -2.1472487449645996, "logits/rejected": -2.040992259979248, "logps/chosen": -22.250045776367188, "logps/rejected": -1.2170778512954712, "loss": 0.276, "rewards/accuracies": 1.0, "rewards/chosen": 2.0104310512542725, "rewards/margins": 1.1460902690887451, "rewards/rejected": 0.8643407225608826, "step": 3242 }, { "epoch": 1.75, "learning_rate": 6.24569139018928e-08, "logits/chosen": -2.1625187397003174, "logits/rejected": -2.1654341220855713, "logps/chosen": -3.7949438095092773, "logps/rejected": -3.9906270503997803, "loss": 0.4547, "rewards/accuracies": 1.0, "rewards/chosen": 1.0963982343673706, "rewards/margins": 0.552070677280426, "rewards/rejected": 0.5443275570869446, "step": 3243 }, { "epoch": 1.75, "learning_rate": 6.243576346021943e-08, "logits/chosen": -1.944151759147644, "logits/rejected": -2.344541072845459, "logps/chosen": -5.543309688568115, "logps/rejected": -5.562114715576172, "loss": 0.7017, "rewards/accuracies": 0.0, "rewards/chosen": 0.47484955191612244, "rewards/margins": -0.01700693368911743, "rewards/rejected": 0.49185648560523987, "step": 3244 }, { "epoch": 1.75, "learning_rate": 6.241461064634187e-08, "logits/chosen": -2.1091420650482178, "logits/rejected": -2.2197070121765137, "logps/chosen": -18.284093856811523, "logps/rejected": -21.41080665588379, "loss": 0.3085, "rewards/accuracies": 1.0, "rewards/chosen": 1.7162344455718994, "rewards/margins": 1.017728328704834, "rewards/rejected": 0.6985061764717102, "step": 3245 }, { "epoch": 1.75, "learning_rate": 6.239345546429516e-08, "logits/chosen": -2.239577054977417, "logits/rejected": -2.1529476642608643, "logps/chosen": -21.121192932128906, "logps/rejected": -3.442927122116089, "loss": 0.255, "rewards/accuracies": 1.0, "rewards/chosen": 1.943280816078186, "rewards/margins": 1.2361679077148438, "rewards/rejected": 0.7071129679679871, "step": 3246 }, { "epoch": 1.75, "learning_rate": 6.237229791811476e-08, "logits/chosen": -2.0398359298706055, "logits/rejected": -2.253251075744629, "logps/chosen": -2.3285653591156006, "logps/rejected": -0.5814509987831116, "loss": 0.7159, "rewards/accuracies": 0.0, "rewards/chosen": 1.0089120864868164, "rewards/margins": -0.04498434066772461, "rewards/rejected": 1.053896427154541, "step": 3247 }, { "epoch": 1.75, "learning_rate": 6.235113801183665e-08, "logits/chosen": -1.9371464252471924, "logits/rejected": -2.2643017768859863, "logps/chosen": -2.9810993671417236, "logps/rejected": -1.915582299232483, "loss": 0.7466, "rewards/accuracies": 0.0, "rewards/chosen": 0.7479241490364075, "rewards/margins": -0.10416853427886963, "rewards/rejected": 0.8520926833152771, "step": 3248 }, { "epoch": 1.75, "learning_rate": 6.232997574949719e-08, "logits/chosen": -2.195988178253174, "logits/rejected": -2.2009432315826416, "logps/chosen": -0.2440732717514038, "logps/rejected": -4.226179599761963, "loss": 0.4657, "rewards/accuracies": 1.0, "rewards/chosen": 0.9326421618461609, "rewards/margins": 0.5223069190979004, "rewards/rejected": 0.4103352129459381, "step": 3249 }, { "epoch": 1.75, "learning_rate": 6.230881113513324e-08, "logits/chosen": -2.0912177562713623, "logits/rejected": -2.2528412342071533, "logps/chosen": -2.907576322555542, "logps/rejected": -2.8159868717193604, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 0.6619022488594055, "rewards/margins": 0.023616671562194824, "rewards/rejected": 0.6382855772972107, "step": 3250 }, { "epoch": 1.75, "learning_rate": 6.228764417278206e-08, "logits/chosen": -2.081376791000366, "logits/rejected": -2.0806286334991455, "logps/chosen": -4.921653747558594, "logps/rejected": -5.041207790374756, "loss": 0.5268, "rewards/accuracies": 1.0, "rewards/chosen": 1.0393877029418945, "rewards/margins": 0.36606621742248535, "rewards/rejected": 0.6733214855194092, "step": 3251 }, { "epoch": 1.75, "learning_rate": 6.226647486648144e-08, "logits/chosen": -2.0681257247924805, "logits/rejected": -2.0753931999206543, "logps/chosen": -0.5379035472869873, "logps/rejected": -7.217770576477051, "loss": 0.4017, "rewards/accuracies": 1.0, "rewards/chosen": 1.1687030792236328, "rewards/margins": 0.7046067714691162, "rewards/rejected": 0.4640962779521942, "step": 3252 }, { "epoch": 1.75, "learning_rate": 6.224530322026953e-08, "logits/chosen": -2.032925605773926, "logits/rejected": -2.2651922702789307, "logps/chosen": -3.8469793796539307, "logps/rejected": -0.6011066436767578, "loss": 0.6303, "rewards/accuracies": 1.0, "rewards/chosen": 1.1282546520233154, "rewards/margins": 0.12984907627105713, "rewards/rejected": 0.9984055757522583, "step": 3253 }, { "epoch": 1.76, "learning_rate": 6.222412923818498e-08, "logits/chosen": -1.9436631202697754, "logits/rejected": -2.2447595596313477, "logps/chosen": -2.802231788635254, "logps/rejected": -0.8588107824325562, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.9059864282608032, "rewards/margins": 0.01812678575515747, "rewards/rejected": 0.8878596425056458, "step": 3254 }, { "epoch": 1.76, "learning_rate": 6.220295292426687e-08, "logits/chosen": -2.0789051055908203, "logits/rejected": -2.034029960632324, "logps/chosen": -28.77303123474121, "logps/rejected": -1.9630329608917236, "loss": 0.2946, "rewards/accuracies": 1.0, "rewards/chosen": 1.87905752658844, "rewards/margins": 1.0710678100585938, "rewards/rejected": 0.807989776134491, "step": 3255 }, { "epoch": 1.76, "learning_rate": 6.218177428255467e-08, "logits/chosen": -2.079737663269043, "logits/rejected": -2.2808444499969482, "logps/chosen": -2.5924861431121826, "logps/rejected": -3.3315858840942383, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.7994315028190613, "rewards/margins": 0.035145342350006104, "rewards/rejected": 0.7642861604690552, "step": 3256 }, { "epoch": 1.76, "learning_rate": 6.216059331708841e-08, "logits/chosen": -2.112290859222412, "logits/rejected": -2.09375, "logps/chosen": -15.649429321289062, "logps/rejected": -1.728232979774475, "loss": 0.4037, "rewards/accuracies": 1.0, "rewards/chosen": 1.218470573425293, "rewards/margins": 0.6985217928886414, "rewards/rejected": 0.5199487805366516, "step": 3257 }, { "epoch": 1.76, "learning_rate": 6.213941003190848e-08, "logits/chosen": -2.134889841079712, "logits/rejected": -2.3703434467315674, "logps/chosen": -3.1813862323760986, "logps/rejected": -3.058927536010742, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.9511818289756775, "rewards/margins": -0.0074700117111206055, "rewards/rejected": 0.9586518406867981, "step": 3258 }, { "epoch": 1.76, "learning_rate": 6.211822443105573e-08, "logits/chosen": -2.0997979640960693, "logits/rejected": -2.1071813106536865, "logps/chosen": -3.8735976219177246, "logps/rejected": -4.311367988586426, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 1.2902498245239258, "rewards/margins": 0.5633788108825684, "rewards/rejected": 0.7268710136413574, "step": 3259 }, { "epoch": 1.76, "learning_rate": 6.209703651857144e-08, "logits/chosen": -2.0508337020874023, "logits/rejected": -1.9978333711624146, "logps/chosen": -8.271964073181152, "logps/rejected": -7.651916980743408, "loss": 0.4231, "rewards/accuracies": 1.0, "rewards/chosen": 1.4645341634750366, "rewards/margins": 0.6412658095359802, "rewards/rejected": 0.8232683539390564, "step": 3260 }, { "epoch": 1.76, "learning_rate": 6.207584629849735e-08, "logits/chosen": -1.9630857706069946, "logits/rejected": -2.2616636753082275, "logps/chosen": -2.449111223220825, "logps/rejected": -2.5478978157043457, "loss": 0.7017, "rewards/accuracies": 0.0, "rewards/chosen": 0.8665705919265747, "rewards/margins": -0.01694709062576294, "rewards/rejected": 0.8835176825523376, "step": 3261 }, { "epoch": 1.76, "learning_rate": 6.205465377487566e-08, "logits/chosen": -2.1609623432159424, "logits/rejected": -2.161450147628784, "logps/chosen": -2.998309850692749, "logps/rejected": -3.213967800140381, "loss": 0.2883, "rewards/accuracies": 1.0, "rewards/chosen": 1.6894947290420532, "rewards/margins": 1.0961949825286865, "rewards/rejected": 0.5932997465133667, "step": 3262 }, { "epoch": 1.76, "learning_rate": 6.203345895174895e-08, "logits/chosen": -2.2153072357177734, "logits/rejected": -2.2432825565338135, "logps/chosen": -18.245262145996094, "logps/rejected": -14.074420928955078, "loss": 0.475, "rewards/accuracies": 1.0, "rewards/chosen": 1.7782715559005737, "rewards/margins": 0.49742746353149414, "rewards/rejected": 1.2808440923690796, "step": 3263 }, { "epoch": 1.76, "learning_rate": 6.20122618331603e-08, "logits/chosen": -2.1048686504364014, "logits/rejected": -2.276557683944702, "logps/chosen": -11.884239196777344, "logps/rejected": -5.789351940155029, "loss": 0.6724, "rewards/accuracies": 1.0, "rewards/chosen": 0.8996431231498718, "rewards/margins": 0.041987597942352295, "rewards/rejected": 0.8576555252075195, "step": 3264 }, { "epoch": 1.76, "learning_rate": 6.199106242315318e-08, "logits/chosen": -2.0461812019348145, "logits/rejected": -2.0400431156158447, "logps/chosen": -5.442716121673584, "logps/rejected": -2.002063274383545, "loss": 0.3565, "rewards/accuracies": 1.0, "rewards/chosen": 1.6963627338409424, "rewards/margins": 0.8479326367378235, "rewards/rejected": 0.8484300971031189, "step": 3265 }, { "epoch": 1.76, "learning_rate": 6.196986072577153e-08, "logits/chosen": -2.014801263809204, "logits/rejected": -2.267388105392456, "logps/chosen": -1.7690128087997437, "logps/rejected": -1.4366904497146606, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.8619722723960876, "rewards/margins": -0.004632711410522461, "rewards/rejected": 0.8666049838066101, "step": 3266 }, { "epoch": 1.76, "learning_rate": 6.194865674505969e-08, "logits/chosen": -2.0160744190216064, "logits/rejected": -2.2614340782165527, "logps/chosen": -0.19911058247089386, "logps/rejected": -0.22503824532032013, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9488635063171387, "rewards/margins": 0.016616880893707275, "rewards/rejected": 0.9322466254234314, "step": 3267 }, { "epoch": 1.76, "learning_rate": 6.19274504850625e-08, "logits/chosen": -2.1085383892059326, "logits/rejected": -2.1035759449005127, "logps/chosen": -2.548888683319092, "logps/rejected": -7.048242092132568, "loss": 0.4084, "rewards/accuracies": 1.0, "rewards/chosen": 1.1083366870880127, "rewards/margins": 0.6844367980957031, "rewards/rejected": 0.42389988899230957, "step": 3268 }, { "epoch": 1.76, "learning_rate": 6.190624194982517e-08, "logits/chosen": -2.1302878856658936, "logits/rejected": -2.280083179473877, "logps/chosen": -3.1397931575775146, "logps/rejected": -3.199784517288208, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.7819898724555969, "rewards/margins": 0.021125316619873047, "rewards/rejected": 0.7608645558357239, "step": 3269 }, { "epoch": 1.76, "learning_rate": 6.188503114339337e-08, "logits/chosen": -2.06489896774292, "logits/rejected": -2.0704188346862793, "logps/chosen": -0.4523007273674011, "logps/rejected": -5.478265762329102, "loss": 0.4518, "rewards/accuracies": 1.0, "rewards/chosen": 0.977348804473877, "rewards/margins": 0.5602478981018066, "rewards/rejected": 0.4171009063720703, "step": 3270 }, { "epoch": 1.76, "learning_rate": 6.186381806981322e-08, "logits/chosen": -2.0955684185028076, "logits/rejected": -2.339791774749756, "logps/chosen": -0.8649486899375916, "logps/rejected": -0.8691890239715576, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8212982416152954, "rewards/margins": 0.019217610359191895, "rewards/rejected": 0.8020806312561035, "step": 3271 }, { "epoch": 1.76, "learning_rate": 6.184260273313121e-08, "logits/chosen": -1.9952539205551147, "logits/rejected": -2.253797769546509, "logps/chosen": -1.5861177444458008, "logps/rejected": -1.3132708072662354, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.7055408358573914, "rewards/margins": -0.0027761459350585938, "rewards/rejected": 0.70831698179245, "step": 3272 }, { "epoch": 1.77, "learning_rate": 6.182138513739434e-08, "logits/chosen": -2.025252342224121, "logits/rejected": -2.264810085296631, "logps/chosen": -1.3491456508636475, "logps/rejected": -4.013575077056885, "loss": 0.6184, "rewards/accuracies": 1.0, "rewards/chosen": 0.964345395565033, "rewards/margins": 0.15563321113586426, "rewards/rejected": 0.8087121844291687, "step": 3273 }, { "epoch": 1.77, "learning_rate": 6.180016528665e-08, "logits/chosen": -2.091087818145752, "logits/rejected": -2.0872440338134766, "logps/chosen": -13.125203132629395, "logps/rejected": -1.338679552078247, "loss": 0.8303, "rewards/accuracies": 0.0, "rewards/chosen": 0.7058970332145691, "rewards/margins": -0.2576819658279419, "rewards/rejected": 0.963578999042511, "step": 3274 }, { "epoch": 1.77, "learning_rate": 6.177894318494604e-08, "logits/chosen": -2.142688751220703, "logits/rejected": -2.241455078125, "logps/chosen": -1.4305514097213745, "logps/rejected": -1.463835597038269, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.9985523223876953, "rewards/margins": 0.013728201389312744, "rewards/rejected": 0.9848241209983826, "step": 3275 }, { "epoch": 1.77, "learning_rate": 6.175771883633067e-08, "logits/chosen": -2.188112258911133, "logits/rejected": -2.1906347274780273, "logps/chosen": -1.9408912658691406, "logps/rejected": -0.8096373677253723, "loss": 0.6419, "rewards/accuracies": 1.0, "rewards/chosen": 1.1777355670928955, "rewards/margins": 0.1051708459854126, "rewards/rejected": 1.072564721107483, "step": 3276 }, { "epoch": 1.77, "learning_rate": 6.173649224485262e-08, "logits/chosen": -2.1847612857818604, "logits/rejected": -2.180039644241333, "logps/chosen": -5.761869430541992, "logps/rejected": -5.36201286315918, "loss": 0.2871, "rewards/accuracies": 1.0, "rewards/chosen": 1.4728339910507202, "rewards/margins": 1.1011157035827637, "rewards/rejected": 0.37171831727027893, "step": 3277 }, { "epoch": 1.77, "learning_rate": 6.171526341456099e-08, "logits/chosen": -2.160139560699463, "logits/rejected": -2.323833465576172, "logps/chosen": -0.5981830358505249, "logps/rejected": -0.5978612899780273, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9575872421264648, "rewards/margins": 0.0077825188636779785, "rewards/rejected": 0.9498047232627869, "step": 3278 }, { "epoch": 1.77, "learning_rate": 6.169403234950528e-08, "logits/chosen": -2.027994394302368, "logits/rejected": -2.0364601612091064, "logps/chosen": -1.6750942468643188, "logps/rejected": -3.213832378387451, "loss": 0.4878, "rewards/accuracies": 1.0, "rewards/chosen": 0.9797313809394836, "rewards/margins": 0.46403366327285767, "rewards/rejected": 0.515697717666626, "step": 3279 }, { "epoch": 1.77, "learning_rate": 6.167279905373552e-08, "logits/chosen": -2.062011957168579, "logits/rejected": -2.0743353366851807, "logps/chosen": -2.4688720703125, "logps/rejected": -1.9825831651687622, "loss": 0.5526, "rewards/accuracies": 1.0, "rewards/chosen": 1.0245821475982666, "rewards/margins": 0.3040313124656677, "rewards/rejected": 0.7205508351325989, "step": 3280 }, { "epoch": 1.77, "learning_rate": 6.165156353130204e-08, "logits/chosen": -2.18916916847229, "logits/rejected": -2.3027195930480957, "logps/chosen": -6.372010231018066, "logps/rejected": -30.793790817260742, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": 1.3659900426864624, "rewards/margins": 0.7291463613510132, "rewards/rejected": 0.6368436813354492, "step": 3281 }, { "epoch": 1.77, "learning_rate": 6.16303257862557e-08, "logits/chosen": -2.0901591777801514, "logits/rejected": -2.2836732864379883, "logps/chosen": -1.8237407207489014, "logps/rejected": -1.6772531270980835, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 1.076849341392517, "rewards/margins": -0.001059889793395996, "rewards/rejected": 1.077909231185913, "step": 3282 }, { "epoch": 1.77, "learning_rate": 6.160908582264774e-08, "logits/chosen": -2.1036376953125, "logits/rejected": -2.124051094055176, "logps/chosen": -2.023397922515869, "logps/rejected": -3.234309196472168, "loss": 0.5639, "rewards/accuracies": 1.0, "rewards/chosen": 1.057564377784729, "rewards/margins": 0.2776123285293579, "rewards/rejected": 0.7799520492553711, "step": 3283 }, { "epoch": 1.77, "learning_rate": 6.15878436445298e-08, "logits/chosen": -2.098205804824829, "logits/rejected": -2.3038458824157715, "logps/chosen": -4.511847496032715, "logps/rejected": -4.587774276733398, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.7552165389060974, "rewards/margins": 0.03549051284790039, "rewards/rejected": 0.719726026058197, "step": 3284 }, { "epoch": 1.77, "learning_rate": 6.156659925595398e-08, "logits/chosen": -2.103205680847168, "logits/rejected": -2.274010419845581, "logps/chosen": -2.213310718536377, "logps/rejected": -0.6902393698692322, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.8762218356132507, "rewards/margins": 0.022751986980438232, "rewards/rejected": 0.8534698486328125, "step": 3285 }, { "epoch": 1.77, "learning_rate": 6.154535266097278e-08, "logits/chosen": -2.1197702884674072, "logits/rejected": -2.3343944549560547, "logps/chosen": -11.299875259399414, "logps/rejected": -10.108549118041992, "loss": 0.4412, "rewards/accuracies": 1.0, "rewards/chosen": 1.3453022241592407, "rewards/margins": 0.5895679593086243, "rewards/rejected": 0.7557342648506165, "step": 3286 }, { "epoch": 1.77, "learning_rate": 6.152410386363913e-08, "logits/chosen": -2.0844180583953857, "logits/rejected": -2.323483467102051, "logps/chosen": -0.4837232232093811, "logps/rejected": -0.4505513310432434, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.8825429081916809, "rewards/margins": -0.009404301643371582, "rewards/rejected": 0.8919472098350525, "step": 3287 }, { "epoch": 1.77, "learning_rate": 6.15028528680064e-08, "logits/chosen": -2.0405168533325195, "logits/rejected": -2.2970099449157715, "logps/chosen": -0.8726915717124939, "logps/rejected": -0.8815761208534241, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.8926374316215515, "rewards/margins": 0.0151824951171875, "rewards/rejected": 0.877454936504364, "step": 3288 }, { "epoch": 1.77, "learning_rate": 6.148159967812831e-08, "logits/chosen": -2.0683507919311523, "logits/rejected": -2.068079710006714, "logps/chosen": -0.6512083411216736, "logps/rejected": -1.829602599143982, "loss": 0.6168, "rewards/accuracies": 1.0, "rewards/chosen": 0.9774102568626404, "rewards/margins": 0.1589025855064392, "rewards/rejected": 0.8185076713562012, "step": 3289 }, { "epoch": 1.77, "learning_rate": 6.146034429805909e-08, "logits/chosen": -2.100435972213745, "logits/rejected": -2.287933349609375, "logps/chosen": -7.642075538635254, "logps/rejected": -0.330623984336853, "loss": 0.6464, "rewards/accuracies": 1.0, "rewards/chosen": 1.0861833095550537, "rewards/margins": 0.09577685594558716, "rewards/rejected": 0.9904064536094666, "step": 3290 }, { "epoch": 1.78, "learning_rate": 6.143908673185333e-08, "logits/chosen": -2.0963127613067627, "logits/rejected": -2.09898042678833, "logps/chosen": -3.4267797470092773, "logps/rejected": -0.8043584823608398, "loss": 0.6201, "rewards/accuracies": 1.0, "rewards/chosen": 1.0004888772964478, "rewards/margins": 0.1519516110420227, "rewards/rejected": 0.848537266254425, "step": 3291 }, { "epoch": 1.78, "learning_rate": 6.141782698356607e-08, "logits/chosen": -2.20927095413208, "logits/rejected": -2.368333578109741, "logps/chosen": -4.400881290435791, "logps/rejected": -0.9254956841468811, "loss": 0.8406, "rewards/accuracies": 0.0, "rewards/chosen": 0.8900697827339172, "rewards/margins": -0.2758665680885315, "rewards/rejected": 1.1659363508224487, "step": 3292 }, { "epoch": 1.78, "learning_rate": 6.139656505725273e-08, "logits/chosen": -2.004409074783325, "logits/rejected": -2.0054843425750732, "logps/chosen": -1.939840316772461, "logps/rejected": -5.841222763061523, "loss": 0.408, "rewards/accuracies": 1.0, "rewards/chosen": 1.1552218198776245, "rewards/margins": 0.6856814026832581, "rewards/rejected": 0.46954041719436646, "step": 3293 }, { "epoch": 1.78, "learning_rate": 6.137530095696915e-08, "logits/chosen": -1.9434161186218262, "logits/rejected": -2.2781357765197754, "logps/chosen": -0.6877213716506958, "logps/rejected": -0.6725907921791077, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.8660759329795837, "rewards/margins": 0.013042211532592773, "rewards/rejected": 0.853033721446991, "step": 3294 }, { "epoch": 1.78, "learning_rate": 6.13540346867716e-08, "logits/chosen": -2.1320087909698486, "logits/rejected": -2.130905866622925, "logps/chosen": -0.5917305946350098, "logps/rejected": -2.506542682647705, "loss": 0.6373, "rewards/accuracies": 1.0, "rewards/chosen": 0.9233919382095337, "rewards/margins": 0.11509555578231812, "rewards/rejected": 0.8082963824272156, "step": 3295 }, { "epoch": 1.78, "learning_rate": 6.13327662507168e-08, "logits/chosen": -2.0340089797973633, "logits/rejected": -2.0305075645446777, "logps/chosen": -1.2071518898010254, "logps/rejected": -7.28474235534668, "loss": 0.4005, "rewards/accuracies": 1.0, "rewards/chosen": 1.012475848197937, "rewards/margins": 0.7081516981124878, "rewards/rejected": 0.3043241500854492, "step": 3296 }, { "epoch": 1.78, "learning_rate": 6.13114956528618e-08, "logits/chosen": -2.182833194732666, "logits/rejected": -2.146585702896118, "logps/chosen": -6.398882865905762, "logps/rejected": -15.94090747833252, "loss": 0.3608, "rewards/accuracies": 1.0, "rewards/chosen": 1.2084832191467285, "rewards/margins": 0.8334585428237915, "rewards/rejected": 0.3750247061252594, "step": 3297 }, { "epoch": 1.78, "learning_rate": 6.129022289726413e-08, "logits/chosen": -2.0862607955932617, "logits/rejected": -2.068833112716675, "logps/chosen": -8.240901947021484, "logps/rejected": -1.7427198886871338, "loss": 0.3888, "rewards/accuracies": 1.0, "rewards/chosen": 1.569421648979187, "rewards/margins": 0.7440751791000366, "rewards/rejected": 0.8253464698791504, "step": 3298 }, { "epoch": 1.78, "learning_rate": 6.12689479879817e-08, "logits/chosen": -2.1210272312164307, "logits/rejected": -2.1192734241485596, "logps/chosen": -2.2529051303863525, "logps/rejected": -1.1860984563827515, "loss": 0.4999, "rewards/accuracies": 1.0, "rewards/chosen": 1.2059487104415894, "rewards/margins": 0.4329957962036133, "rewards/rejected": 0.7729529142379761, "step": 3299 }, { "epoch": 1.78, "learning_rate": 6.124767092907285e-08, "logits/chosen": -2.061901092529297, "logits/rejected": -2.3317697048187256, "logps/chosen": -0.31794264912605286, "logps/rejected": -0.3688424825668335, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.8992270827293396, "rewards/margins": 0.02056831121444702, "rewards/rejected": 0.8786587715148926, "step": 3300 }, { "epoch": 1.78, "learning_rate": 6.122639172459632e-08, "logits/chosen": -2.1006767749786377, "logits/rejected": -2.0977401733398438, "logps/chosen": -2.517097234725952, "logps/rejected": -5.241234302520752, "loss": 0.3921, "rewards/accuracies": 1.0, "rewards/chosen": 1.3065317869186401, "rewards/margins": 0.7339152693748474, "rewards/rejected": 0.5726165175437927, "step": 3301 }, { "epoch": 1.78, "learning_rate": 6.120511037861125e-08, "logits/chosen": -2.0814404487609863, "logits/rejected": -2.3082046508789062, "logps/chosen": -0.6716490387916565, "logps/rejected": -0.649625837802887, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.9625796675682068, "rewards/margins": 0.021354317665100098, "rewards/rejected": 0.9412253499031067, "step": 3302 }, { "epoch": 1.78, "learning_rate": 6.118382689517719e-08, "logits/chosen": -2.0390512943267822, "logits/rejected": -2.2888667583465576, "logps/chosen": -0.2719242572784424, "logps/rejected": -0.28775203227996826, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 1.0344138145446777, "rewards/margins": 0.0014214515686035156, "rewards/rejected": 1.0329923629760742, "step": 3303 }, { "epoch": 1.78, "learning_rate": 6.116254127835413e-08, "logits/chosen": -1.9920445680618286, "logits/rejected": -1.9899910688400269, "logps/chosen": -0.5890056490898132, "logps/rejected": -7.928160667419434, "loss": 0.5885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9218605160713196, "rewards/margins": 0.22158515453338623, "rewards/rejected": 0.7002753615379333, "step": 3304 }, { "epoch": 1.78, "learning_rate": 6.114125353220243e-08, "logits/chosen": -2.091224431991577, "logits/rejected": -2.1004741191864014, "logps/chosen": -0.9957432746887207, "logps/rejected": -11.316217422485352, "loss": 0.5094, "rewards/accuracies": 1.0, "rewards/chosen": 0.9425039291381836, "rewards/margins": 0.4090539813041687, "rewards/rejected": 0.5334499478340149, "step": 3305 }, { "epoch": 1.78, "learning_rate": 6.111996366078286e-08, "logits/chosen": -2.064068078994751, "logits/rejected": -2.2896716594696045, "logps/chosen": -0.8873221278190613, "logps/rejected": -0.9230695366859436, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.9518739581108093, "rewards/margins": 0.029473304748535156, "rewards/rejected": 0.9224006533622742, "step": 3306 }, { "epoch": 1.78, "learning_rate": 6.10986716681566e-08, "logits/chosen": -2.172964096069336, "logits/rejected": -2.181356191635132, "logps/chosen": -2.011850118637085, "logps/rejected": -2.4549431800842285, "loss": 0.507, "rewards/accuracies": 1.0, "rewards/chosen": 1.1175990104675293, "rewards/margins": 0.4149983525276184, "rewards/rejected": 0.7026006579399109, "step": 3307 }, { "epoch": 1.78, "learning_rate": 6.107737755838527e-08, "logits/chosen": -2.1424450874328613, "logits/rejected": -2.142612934112549, "logps/chosen": -1.076519250869751, "logps/rejected": -1.9037432670593262, "loss": 0.598, "rewards/accuracies": 1.0, "rewards/chosen": 1.063205599784851, "rewards/margins": 0.20024532079696655, "rewards/rejected": 0.8629602789878845, "step": 3308 }, { "epoch": 1.78, "learning_rate": 6.105608133553084e-08, "logits/chosen": -2.1444900035858154, "logits/rejected": -2.1465156078338623, "logps/chosen": -1.487696886062622, "logps/rejected": -11.508934020996094, "loss": 0.3608, "rewards/accuracies": 1.0, "rewards/chosen": 1.1618398427963257, "rewards/margins": 0.8337350487709045, "rewards/rejected": 0.32810479402542114, "step": 3309 }, { "epoch": 1.79, "learning_rate": 6.10347830036557e-08, "logits/chosen": -2.1311700344085693, "logits/rejected": -2.31168794631958, "logps/chosen": -5.173255920410156, "logps/rejected": -10.9009370803833, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9640888571739197, "rewards/margins": 0.021711111068725586, "rewards/rejected": 0.9423777461051941, "step": 3310 }, { "epoch": 1.79, "learning_rate": 6.101348256682266e-08, "logits/chosen": -2.025819778442383, "logits/rejected": -2.2463924884796143, "logps/chosen": -8.663070678710938, "logps/rejected": -5.462612628936768, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.8923794031143188, "rewards/margins": 0.03241044282913208, "rewards/rejected": 0.8599689602851868, "step": 3311 }, { "epoch": 1.79, "learning_rate": 6.099218002909489e-08, "logits/chosen": -2.181246519088745, "logits/rejected": -2.3308353424072266, "logps/chosen": -6.737862586975098, "logps/rejected": -0.7791667580604553, "loss": 0.7103, "rewards/accuracies": 0.0, "rewards/chosen": 1.0523335933685303, "rewards/margins": -0.03401815891265869, "rewards/rejected": 1.086351752281189, "step": 3312 }, { "epoch": 1.79, "learning_rate": 6.097087539453604e-08, "logits/chosen": -2.209111213684082, "logits/rejected": -2.2409751415252686, "logps/chosen": -6.171547889709473, "logps/rejected": -24.01820945739746, "loss": 0.7805, "rewards/accuracies": 0.0, "rewards/chosen": 1.022367000579834, "rewards/margins": -0.16768109798431396, "rewards/rejected": 1.190048098564148, "step": 3313 }, { "epoch": 1.79, "learning_rate": 6.094956866721008e-08, "logits/chosen": -2.1570448875427246, "logits/rejected": -2.253098726272583, "logps/chosen": -5.440056800842285, "logps/rejected": -8.413604736328125, "loss": 0.6025, "rewards/accuracies": 1.0, "rewards/chosen": 0.8832583427429199, "rewards/margins": 0.1903747320175171, "rewards/rejected": 0.6928836107254028, "step": 3314 }, { "epoch": 1.79, "learning_rate": 6.092825985118142e-08, "logits/chosen": -2.0534985065460205, "logits/rejected": -2.2853751182556152, "logps/chosen": -1.30758535861969, "logps/rejected": -1.2821556329727173, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.826665997505188, "rewards/margins": -0.005033671855926514, "rewards/rejected": 0.8316996693611145, "step": 3315 }, { "epoch": 1.79, "learning_rate": 6.090694895051486e-08, "logits/chosen": -2.0974349975585938, "logits/rejected": -2.2941558361053467, "logps/chosen": -0.3909311890602112, "logps/rejected": -0.41588735580444336, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 0.9553036689758301, "rewards/margins": 0.030920684337615967, "rewards/rejected": 0.9243829846382141, "step": 3316 }, { "epoch": 1.79, "learning_rate": 6.088563596927557e-08, "logits/chosen": -2.0908870697021484, "logits/rejected": -2.261077404022217, "logps/chosen": -0.11937520653009415, "logps/rejected": -0.12693814933300018, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.9344097375869751, "rewards/margins": 0.018036186695098877, "rewards/rejected": 0.9163735508918762, "step": 3317 }, { "epoch": 1.79, "learning_rate": 6.086432091152915e-08, "logits/chosen": -2.125861644744873, "logits/rejected": -2.1283609867095947, "logps/chosen": -0.6635029911994934, "logps/rejected": -3.3960652351379395, "loss": 0.514, "rewards/accuracies": 1.0, "rewards/chosen": 1.1347764730453491, "rewards/margins": 0.3976295590400696, "rewards/rejected": 0.7371469140052795, "step": 3318 }, { "epoch": 1.79, "learning_rate": 6.084300378134162e-08, "logits/chosen": -2.028172492980957, "logits/rejected": -2.0253546237945557, "logps/chosen": -0.8438293933868408, "logps/rejected": -2.5758471488952637, "loss": 0.6324, "rewards/accuracies": 1.0, "rewards/chosen": 0.9092168211936951, "rewards/margins": 0.12538528442382812, "rewards/rejected": 0.7838315367698669, "step": 3319 }, { "epoch": 1.79, "learning_rate": 6.082168458277933e-08, "logits/chosen": -1.9553258419036865, "logits/rejected": -1.9516476392745972, "logps/chosen": -4.791092395782471, "logps/rejected": -4.5268330574035645, "loss": 0.3427, "rewards/accuracies": 1.0, "rewards/chosen": 1.4179718494415283, "rewards/margins": 0.8947775959968567, "rewards/rejected": 0.5231942534446716, "step": 3320 }, { "epoch": 1.79, "learning_rate": 6.080036331990908e-08, "logits/chosen": -1.9954133033752441, "logits/rejected": -2.297028064727783, "logps/chosen": -9.395713806152344, "logps/rejected": -7.242377281188965, "loss": 0.7498, "rewards/accuracies": 0.0, "rewards/chosen": 0.9012615084648132, "rewards/margins": -0.11019796133041382, "rewards/rejected": 1.011459469795227, "step": 3321 }, { "epoch": 1.79, "learning_rate": 6.077903999679802e-08, "logits/chosen": -2.1207079887390137, "logits/rejected": -2.3420753479003906, "logps/chosen": -4.216137886047363, "logps/rejected": -3.8781485557556152, "loss": 0.7015, "rewards/accuracies": 0.0, "rewards/chosen": 0.9685193300247192, "rewards/margins": -0.016647040843963623, "rewards/rejected": 0.9851663708686829, "step": 3322 }, { "epoch": 1.79, "learning_rate": 6.075771461751373e-08, "logits/chosen": -2.0288264751434326, "logits/rejected": -2.02638840675354, "logps/chosen": -0.26727598905563354, "logps/rejected": -4.141547203063965, "loss": 0.4758, "rewards/accuracies": 1.0, "rewards/chosen": 0.9424058198928833, "rewards/margins": 0.4954294264316559, "rewards/rejected": 0.4469763934612274, "step": 3323 }, { "epoch": 1.79, "learning_rate": 6.073638718612416e-08, "logits/chosen": -2.018179416656494, "logits/rejected": -1.9895423650741577, "logps/chosen": -8.47492790222168, "logps/rejected": -5.446889877319336, "loss": 0.4159, "rewards/accuracies": 1.0, "rewards/chosen": 1.4122730493545532, "rewards/margins": 0.6621381640434265, "rewards/rejected": 0.7501348853111267, "step": 3324 }, { "epoch": 1.79, "learning_rate": 6.071505770669767e-08, "logits/chosen": -1.9674381017684937, "logits/rejected": -2.26810884475708, "logps/chosen": -0.18761052191257477, "logps/rejected": -0.17682567238807678, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.946901261806488, "rewards/margins": 0.025566399097442627, "rewards/rejected": 0.9213348627090454, "step": 3325 }, { "epoch": 1.79, "learning_rate": 6.0693726183303e-08, "logits/chosen": -2.0652008056640625, "logits/rejected": -2.0676863193511963, "logps/chosen": -4.69952392578125, "logps/rejected": -1.8177943229675293, "loss": 0.4859, "rewards/accuracies": 1.0, "rewards/chosen": 1.3916606903076172, "rewards/margins": 0.46892356872558594, "rewards/rejected": 0.9227371215820312, "step": 3326 }, { "epoch": 1.79, "learning_rate": 6.067239262000925e-08, "logits/chosen": -2.0877344608306885, "logits/rejected": -2.2613916397094727, "logps/chosen": -26.606164932250977, "logps/rejected": -27.60515594482422, "loss": 0.7216, "rewards/accuracies": 0.0, "rewards/chosen": 0.7519567608833313, "rewards/margins": -0.056060969829559326, "rewards/rejected": 0.8080177307128906, "step": 3327 }, { "epoch": 1.8, "learning_rate": 6.065105702088598e-08, "logits/chosen": -2.0547804832458496, "logits/rejected": -2.06612491607666, "logps/chosen": -6.226049423217773, "logps/rejected": -2.1995081901550293, "loss": 0.472, "rewards/accuracies": 1.0, "rewards/chosen": 1.2767454385757446, "rewards/margins": 0.5054817795753479, "rewards/rejected": 0.7712636590003967, "step": 3328 }, { "epoch": 1.8, "learning_rate": 6.062971939000305e-08, "logits/chosen": -1.949069857597351, "logits/rejected": -2.258265495300293, "logps/chosen": -0.7413734197616577, "logps/rejected": -0.9125319719314575, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.8958630561828613, "rewards/margins": -0.00449526309967041, "rewards/rejected": 0.9003583192825317, "step": 3329 }, { "epoch": 1.8, "learning_rate": 6.06083797314308e-08, "logits/chosen": -2.0471389293670654, "logits/rejected": -2.036689043045044, "logps/chosen": -3.766618490219116, "logps/rejected": -7.495491027832031, "loss": 0.2726, "rewards/accuracies": 1.0, "rewards/chosen": 1.5880450010299683, "rewards/margins": 1.16041898727417, "rewards/rejected": 0.4276260435581207, "step": 3330 }, { "epoch": 1.8, "learning_rate": 6.058703804923991e-08, "logits/chosen": -2.2896790504455566, "logits/rejected": -2.13236403465271, "logps/chosen": -27.483680725097656, "logps/rejected": -3.5163984298706055, "loss": 0.1691, "rewards/accuracies": 1.0, "rewards/chosen": 2.2513070106506348, "rewards/margins": 1.6918100118637085, "rewards/rejected": 0.5594969987869263, "step": 3331 }, { "epoch": 1.8, "learning_rate": 6.056569434750141e-08, "logits/chosen": -2.1188557147979736, "logits/rejected": -2.305635690689087, "logps/chosen": -0.18609808385372162, "logps/rejected": -0.20200665295124054, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.8414495587348938, "rewards/margins": 0.030785858631134033, "rewards/rejected": 0.8106637001037598, "step": 3332 }, { "epoch": 1.8, "learning_rate": 6.054434863028677e-08, "logits/chosen": -1.9968987703323364, "logits/rejected": -2.2562143802642822, "logps/chosen": -3.263840675354004, "logps/rejected": -0.4323059320449829, "loss": 0.7399, "rewards/accuracies": 0.0, "rewards/chosen": 0.8782785534858704, "rewards/margins": -0.09139108657836914, "rewards/rejected": 0.9696696400642395, "step": 3333 }, { "epoch": 1.8, "learning_rate": 6.052300090166782e-08, "logits/chosen": -2.057539463043213, "logits/rejected": -2.0662944316864014, "logps/chosen": -1.209242820739746, "logps/rejected": -2.2065601348876953, "loss": 0.4956, "rewards/accuracies": 1.0, "rewards/chosen": 1.0657018423080444, "rewards/margins": 0.44389092922210693, "rewards/rejected": 0.6218109130859375, "step": 3334 }, { "epoch": 1.8, "learning_rate": 6.05016511657168e-08, "logits/chosen": -2.1073663234710693, "logits/rejected": -2.093579053878784, "logps/chosen": -14.303585052490234, "logps/rejected": -5.800752639770508, "loss": 0.2449, "rewards/accuracies": 1.0, "rewards/chosen": 1.7361341714859009, "rewards/margins": 1.2819724082946777, "rewards/rejected": 0.45416173338890076, "step": 3335 }, { "epoch": 1.8, "learning_rate": 6.04802994265063e-08, "logits/chosen": -2.113832950592041, "logits/rejected": -2.1111583709716797, "logps/chosen": -1.4293417930603027, "logps/rejected": -2.3172059059143066, "loss": 0.6367, "rewards/accuracies": 1.0, "rewards/chosen": 1.1318515539169312, "rewards/margins": 0.11617791652679443, "rewards/rejected": 1.0156736373901367, "step": 3336 }, { "epoch": 1.8, "learning_rate": 6.045894568810931e-08, "logits/chosen": -1.9946202039718628, "logits/rejected": -2.257526159286499, "logps/chosen": -0.3304092586040497, "logps/rejected": -0.40652942657470703, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.8896445631980896, "rewards/margins": 0.013799130916595459, "rewards/rejected": 0.8758454322814941, "step": 3337 }, { "epoch": 1.8, "learning_rate": 6.043758995459919e-08, "logits/chosen": -2.1586999893188477, "logits/rejected": -2.137939214706421, "logps/chosen": -8.266504287719727, "logps/rejected": -2.11824893951416, "loss": 0.2716, "rewards/accuracies": 1.0, "rewards/chosen": 1.8793489933013916, "rewards/margins": 1.1645491123199463, "rewards/rejected": 0.7147998809814453, "step": 3338 }, { "epoch": 1.8, "learning_rate": 6.04162322300497e-08, "logits/chosen": -2.0456302165985107, "logits/rejected": -2.2637224197387695, "logps/chosen": -1.0523369312286377, "logps/rejected": -1.008926272392273, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.9955949783325195, "rewards/margins": -0.005007028579711914, "rewards/rejected": 1.0006020069122314, "step": 3339 }, { "epoch": 1.8, "learning_rate": 6.039487251853495e-08, "logits/chosen": -2.135098457336426, "logits/rejected": -2.130624532699585, "logps/chosen": -4.126895427703857, "logps/rejected": -5.056941986083984, "loss": 0.3435, "rewards/accuracies": 1.0, "rewards/chosen": 1.4260419607162476, "rewards/margins": 0.8918699622154236, "rewards/rejected": 0.534171998500824, "step": 3340 }, { "epoch": 1.8, "learning_rate": 6.037351082412947e-08, "logits/chosen": -1.997061848640442, "logits/rejected": -1.998456358909607, "logps/chosen": -5.908972263336182, "logps/rejected": -1.1638367176055908, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 1.2500345706939697, "rewards/margins": 0.6405768990516663, "rewards/rejected": 0.6094576716423035, "step": 3341 }, { "epoch": 1.8, "learning_rate": 6.035214715090813e-08, "logits/chosen": -2.2265729904174805, "logits/rejected": -2.31709361076355, "logps/chosen": -14.516983985900879, "logps/rejected": -10.113363265991211, "loss": 0.6671, "rewards/accuracies": 1.0, "rewards/chosen": 0.896842896938324, "rewards/margins": 0.052835941314697266, "rewards/rejected": 0.8440069556236267, "step": 3342 }, { "epoch": 1.8, "learning_rate": 6.033078150294618e-08, "logits/chosen": -2.133592367172241, "logits/rejected": -2.1321635246276855, "logps/chosen": -9.165778160095215, "logps/rejected": -1.781740427017212, "loss": 0.5073, "rewards/accuracies": 1.0, "rewards/chosen": 1.3206547498703003, "rewards/margins": 0.41430193185806274, "rewards/rejected": 0.9063528180122375, "step": 3343 }, { "epoch": 1.8, "learning_rate": 6.030941388431929e-08, "logits/chosen": -2.0061018466949463, "logits/rejected": -2.0051612854003906, "logps/chosen": -5.794106960296631, "logps/rejected": -15.903753280639648, "loss": 0.4479, "rewards/accuracies": 1.0, "rewards/chosen": 0.872963547706604, "rewards/margins": 0.5710182189941406, "rewards/rejected": 0.301945298910141, "step": 3344 }, { "epoch": 1.8, "learning_rate": 6.028804429910343e-08, "logits/chosen": -2.1020760536193848, "logits/rejected": -2.1051862239837646, "logps/chosen": -2.8931331634521484, "logps/rejected": -3.135183811187744, "loss": 0.5721, "rewards/accuracies": 1.0, "rewards/chosen": 1.130218505859375, "rewards/margins": 0.25883978605270386, "rewards/rejected": 0.8713787198066711, "step": 3345 }, { "epoch": 1.8, "learning_rate": 6.026667275137505e-08, "logits/chosen": -1.9553123712539673, "logits/rejected": -2.259705066680908, "logps/chosen": -0.28891390562057495, "logps/rejected": -0.32110920548439026, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9138333201408386, "rewards/margins": 0.004699289798736572, "rewards/rejected": 0.909134030342102, "step": 3346 }, { "epoch": 1.81, "learning_rate": 6.024529924521086e-08, "logits/chosen": -2.081603765487671, "logits/rejected": -2.073606014251709, "logps/chosen": -12.718097686767578, "logps/rejected": -5.811551570892334, "loss": 0.494, "rewards/accuracies": 1.0, "rewards/chosen": 1.1414257287979126, "rewards/margins": 0.44809091091156006, "rewards/rejected": 0.6933348178863525, "step": 3347 }, { "epoch": 1.81, "learning_rate": 6.022392378468801e-08, "logits/chosen": -2.095837116241455, "logits/rejected": -2.1022300720214844, "logps/chosen": -1.357338786125183, "logps/rejected": -2.4774272441864014, "loss": 0.4917, "rewards/accuracies": 1.0, "rewards/chosen": 1.0935124158859253, "rewards/margins": 0.4539429545402527, "rewards/rejected": 0.6395694613456726, "step": 3348 }, { "epoch": 1.81, "learning_rate": 6.020254637388403e-08, "logits/chosen": -2.103649854660034, "logits/rejected": -2.10888671875, "logps/chosen": -2.4606008529663086, "logps/rejected": -6.113727569580078, "loss": 0.356, "rewards/accuracies": 1.0, "rewards/chosen": 1.2305283546447754, "rewards/margins": 0.8494337797164917, "rewards/rejected": 0.3810945451259613, "step": 3349 }, { "epoch": 1.81, "learning_rate": 6.01811670168768e-08, "logits/chosen": -2.1371023654937744, "logits/rejected": -2.05008602142334, "logps/chosen": -14.140198707580566, "logps/rejected": -5.160446643829346, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": 1.72148859500885, "rewards/margins": 1.2624871730804443, "rewards/rejected": 0.4590013921260834, "step": 3350 }, { "epoch": 1.81, "learning_rate": 6.015978571774456e-08, "logits/chosen": -2.078827142715454, "logits/rejected": -2.077876329421997, "logps/chosen": -1.3798837661743164, "logps/rejected": -3.4895567893981934, "loss": 0.5707, "rewards/accuracies": 1.0, "rewards/chosen": 0.8191068768501282, "rewards/margins": 0.2619079351425171, "rewards/rejected": 0.5571989417076111, "step": 3351 }, { "epoch": 1.81, "learning_rate": 6.013840248056592e-08, "logits/chosen": -2.1246731281280518, "logits/rejected": -2.1328542232513428, "logps/chosen": -1.708097219467163, "logps/rejected": -2.504957675933838, "loss": 0.4167, "rewards/accuracies": 1.0, "rewards/chosen": 1.4723268747329712, "rewards/margins": 0.6598472595214844, "rewards/rejected": 0.8124796152114868, "step": 3352 }, { "epoch": 1.81, "learning_rate": 6.011701730941992e-08, "logits/chosen": -2.018805980682373, "logits/rejected": -2.3446011543273926, "logps/chosen": -1.5882296562194824, "logps/rejected": -1.7899242639541626, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 1.1077817678451538, "rewards/margins": -0.008407831192016602, "rewards/rejected": 1.1161895990371704, "step": 3353 }, { "epoch": 1.81, "learning_rate": 6.00956302083859e-08, "logits/chosen": -1.9989861249923706, "logits/rejected": -1.9866830110549927, "logps/chosen": -0.9888342022895813, "logps/rejected": -3.261584997177124, "loss": 0.5479, "rewards/accuracies": 1.0, "rewards/chosen": 1.009924292564392, "rewards/margins": 0.31534236669540405, "rewards/rejected": 0.694581925868988, "step": 3354 }, { "epoch": 1.81, "learning_rate": 6.007424118154358e-08, "logits/chosen": -2.0204436779022217, "logits/rejected": -2.023789882659912, "logps/chosen": -2.7382540702819824, "logps/rejected": -0.7741727232933044, "loss": 0.6247, "rewards/accuracies": 1.0, "rewards/chosen": 1.1440380811691284, "rewards/margins": 0.14194679260253906, "rewards/rejected": 1.0020912885665894, "step": 3355 }, { "epoch": 1.81, "learning_rate": 6.005285023297305e-08, "logits/chosen": -2.143202066421509, "logits/rejected": -2.317451238632202, "logps/chosen": -0.3950868546962738, "logps/rejected": -0.38092559576034546, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.8619166612625122, "rewards/margins": 0.016810715198516846, "rewards/rejected": 0.8451059460639954, "step": 3356 }, { "epoch": 1.81, "learning_rate": 6.003145736675479e-08, "logits/chosen": -2.013988971710205, "logits/rejected": -2.2500104904174805, "logps/chosen": -1.1320277452468872, "logps/rejected": -1.2405716180801392, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.6612312197685242, "rewards/margins": 0.011747360229492188, "rewards/rejected": 0.649483859539032, "step": 3357 }, { "epoch": 1.81, "learning_rate": 6.001006258696964e-08, "logits/chosen": -2.199424982070923, "logits/rejected": -2.207524299621582, "logps/chosen": -2.692460060119629, "logps/rejected": -5.956344127655029, "loss": 0.2951, "rewards/accuracies": 1.0, "rewards/chosen": 1.346366286277771, "rewards/margins": 1.0691821575164795, "rewards/rejected": 0.2771841585636139, "step": 3358 }, { "epoch": 1.81, "learning_rate": 5.99886658976988e-08, "logits/chosen": -2.079144239425659, "logits/rejected": -1.9794842004776, "logps/chosen": -23.668989181518555, "logps/rejected": -2.389572858810425, "loss": 0.1712, "rewards/accuracies": 1.0, "rewards/chosen": 2.1940343379974365, "rewards/margins": 1.6783437728881836, "rewards/rejected": 0.5156905055046082, "step": 3359 }, { "epoch": 1.81, "learning_rate": 5.996726730302381e-08, "logits/chosen": -2.05871844291687, "logits/rejected": -2.0526654720306396, "logps/chosen": -4.397846698760986, "logps/rejected": -0.9908493757247925, "loss": 0.2427, "rewards/accuracies": 1.0, "rewards/chosen": 2.194453001022339, "rewards/margins": 1.2921158075332642, "rewards/rejected": 0.9023371934890747, "step": 3360 }, { "epoch": 1.81, "learning_rate": 5.994586680702662e-08, "logits/chosen": -2.0899288654327393, "logits/rejected": -2.1086008548736572, "logps/chosen": -2.144115447998047, "logps/rejected": -6.972699165344238, "loss": 0.5499, "rewards/accuracies": 1.0, "rewards/chosen": 1.0745295286178589, "rewards/margins": 0.31051039695739746, "rewards/rejected": 0.7640191316604614, "step": 3361 }, { "epoch": 1.81, "learning_rate": 5.992446441378947e-08, "logits/chosen": -2.128842353820801, "logits/rejected": -2.1347224712371826, "logps/chosen": -1.6059167385101318, "logps/rejected": -1.8139382600784302, "loss": 0.4495, "rewards/accuracies": 1.0, "rewards/chosen": 1.178992509841919, "rewards/margins": 0.5665584802627563, "rewards/rejected": 0.6124340295791626, "step": 3362 }, { "epoch": 1.81, "learning_rate": 5.990306012739505e-08, "logits/chosen": -1.980847954750061, "logits/rejected": -2.227081060409546, "logps/chosen": -1.2683305740356445, "logps/rejected": -1.0409259796142578, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.793370246887207, "rewards/margins": 0.004296839237213135, "rewards/rejected": 0.7890734076499939, "step": 3363 }, { "epoch": 1.81, "learning_rate": 5.988165395192635e-08, "logits/chosen": -2.1727044582366943, "logits/rejected": -2.326209783554077, "logps/chosen": -1.8043477535247803, "logps/rejected": -1.2931102514266968, "loss": 0.7589, "rewards/accuracies": 0.0, "rewards/chosen": 0.898885190486908, "rewards/margins": -0.12738364934921265, "rewards/rejected": 1.0262688398361206, "step": 3364 }, { "epoch": 1.81, "learning_rate": 5.986024589146676e-08, "logits/chosen": -2.0445046424865723, "logits/rejected": -2.0443942546844482, "logps/chosen": -0.6274410486221313, "logps/rejected": -6.636934280395508, "loss": 0.3931, "rewards/accuracies": 1.0, "rewards/chosen": 1.1218066215515137, "rewards/margins": 0.7307436466217041, "rewards/rejected": 0.3910629451274872, "step": 3365 }, { "epoch": 1.82, "learning_rate": 5.983883595009998e-08, "logits/chosen": -2.0214176177978516, "logits/rejected": -2.022904634475708, "logps/chosen": -1.1376168727874756, "logps/rejected": -2.735743999481201, "loss": 0.5201, "rewards/accuracies": 1.0, "rewards/chosen": 1.1141924858093262, "rewards/margins": 0.3825168013572693, "rewards/rejected": 0.7316756844520569, "step": 3366 }, { "epoch": 1.82, "learning_rate": 5.981742413191011e-08, "logits/chosen": -2.1006767749786377, "logits/rejected": -2.099945068359375, "logps/chosen": -2.950847625732422, "logps/rejected": -4.448599338531494, "loss": 0.3459, "rewards/accuracies": 1.0, "rewards/chosen": 1.4227272272109985, "rewards/margins": 0.883553683757782, "rewards/rejected": 0.5391735434532166, "step": 3367 }, { "epoch": 1.82, "learning_rate": 5.979601044098159e-08, "logits/chosen": -2.0224251747131348, "logits/rejected": -2.023550271987915, "logps/chosen": -1.9527957439422607, "logps/rejected": -4.860841274261475, "loss": 0.3264, "rewards/accuracies": 1.0, "rewards/chosen": 1.5197787284851074, "rewards/margins": 0.952084481716156, "rewards/rejected": 0.5676942467689514, "step": 3368 }, { "epoch": 1.82, "learning_rate": 5.977459488139927e-08, "logits/chosen": -2.132936716079712, "logits/rejected": -2.134627103805542, "logps/chosen": -0.5751396417617798, "logps/rejected": -7.385802268981934, "loss": 0.4309, "rewards/accuracies": 1.0, "rewards/chosen": 0.9535273909568787, "rewards/margins": 0.618671178817749, "rewards/rejected": 0.334856241941452, "step": 3369 }, { "epoch": 1.82, "learning_rate": 5.975317745724824e-08, "logits/chosen": -2.0873825550079346, "logits/rejected": -2.257117748260498, "logps/chosen": -3.7529897689819336, "logps/rejected": -1.2405064105987549, "loss": 0.7286, "rewards/accuracies": 0.0, "rewards/chosen": 0.6673445701599121, "rewards/margins": -0.06973153352737427, "rewards/rejected": 0.7370761036872864, "step": 3370 }, { "epoch": 1.82, "learning_rate": 5.973175817261406e-08, "logits/chosen": -2.048567295074463, "logits/rejected": -2.047271251678467, "logps/chosen": -7.668753147125244, "logps/rejected": -2.497105121612549, "loss": 0.4171, "rewards/accuracies": 1.0, "rewards/chosen": 1.3849416971206665, "rewards/margins": 0.6587268710136414, "rewards/rejected": 0.7262148261070251, "step": 3371 }, { "epoch": 1.82, "learning_rate": 5.971033703158258e-08, "logits/chosen": -2.0471200942993164, "logits/rejected": -2.329984188079834, "logps/chosen": -0.43958377838134766, "logps/rejected": -2.974030017852783, "loss": 0.5348, "rewards/accuracies": 1.0, "rewards/chosen": 0.9439670443534851, "rewards/margins": 0.3466312885284424, "rewards/rejected": 0.5973357558250427, "step": 3372 }, { "epoch": 1.82, "learning_rate": 5.968891403824004e-08, "logits/chosen": -2.1053435802459717, "logits/rejected": -2.0912058353424072, "logps/chosen": -16.600128173828125, "logps/rejected": -3.878504753112793, "loss": 0.2708, "rewards/accuracies": 1.0, "rewards/chosen": 1.58562433719635, "rewards/margins": 1.167869210243225, "rewards/rejected": 0.417755126953125, "step": 3373 }, { "epoch": 1.82, "learning_rate": 5.966748919667299e-08, "logits/chosen": -2.036802291870117, "logits/rejected": -2.312825918197632, "logps/chosen": -0.6578810811042786, "logps/rejected": -11.22867202758789, "loss": 0.6755, "rewards/accuracies": 1.0, "rewards/chosen": 0.9205427169799805, "rewards/margins": 0.035692036151885986, "rewards/rejected": 0.8848506808280945, "step": 3374 }, { "epoch": 1.82, "learning_rate": 5.964606251096838e-08, "logits/chosen": -2.06241774559021, "logits/rejected": -2.061891794204712, "logps/chosen": -0.7268264889717102, "logps/rejected": -6.854896545410156, "loss": 0.4122, "rewards/accuracies": 1.0, "rewards/chosen": 1.1177223920822144, "rewards/margins": 0.6730639934539795, "rewards/rejected": 0.4446583688259125, "step": 3375 }, { "epoch": 1.82, "learning_rate": 5.962463398521351e-08, "logits/chosen": -2.174374580383301, "logits/rejected": -2.177668333053589, "logps/chosen": -2.2087414264678955, "logps/rejected": -3.965367317199707, "loss": 0.5, "rewards/accuracies": 1.0, "rewards/chosen": 1.122530460357666, "rewards/margins": 0.43285757303237915, "rewards/rejected": 0.6896728873252869, "step": 3376 }, { "epoch": 1.82, "learning_rate": 5.960320362349601e-08, "logits/chosen": -2.120344400405884, "logits/rejected": -2.154388427734375, "logps/chosen": -1.9884312152862549, "logps/rejected": -6.023921966552734, "loss": 0.6274, "rewards/accuracies": 1.0, "rewards/chosen": 1.0887093544006348, "rewards/margins": 0.13605374097824097, "rewards/rejected": 0.9526556134223938, "step": 3377 }, { "epoch": 1.82, "learning_rate": 5.9581771429903824e-08, "logits/chosen": -2.1283352375030518, "logits/rejected": -2.305983066558838, "logps/chosen": -1.214629888534546, "logps/rejected": -1.1839728355407715, "loss": 0.7003, "rewards/accuracies": 0.0, "rewards/chosen": 0.8236656188964844, "rewards/margins": -0.014209926128387451, "rewards/rejected": 0.8378755450248718, "step": 3378 }, { "epoch": 1.82, "learning_rate": 5.956033740852532e-08, "logits/chosen": -2.0784411430358887, "logits/rejected": -2.3170363903045654, "logps/chosen": -3.3949358463287354, "logps/rejected": -3.730834484100342, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.7612723708152771, "rewards/margins": 0.012413263320922852, "rewards/rejected": 0.7488591074943542, "step": 3379 }, { "epoch": 1.82, "learning_rate": 5.953890156344916e-08, "logits/chosen": -2.0056817531585693, "logits/rejected": -2.313979148864746, "logps/chosen": -0.5005435943603516, "logps/rejected": -0.5864243507385254, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.995373547077179, "rewards/margins": 0.026900887489318848, "rewards/rejected": 0.9684726595878601, "step": 3380 }, { "epoch": 1.82, "learning_rate": 5.951746389876439e-08, "logits/chosen": -2.038123846054077, "logits/rejected": -2.0476646423339844, "logps/chosen": -1.5480631589889526, "logps/rejected": -2.575869560241699, "loss": 0.4338, "rewards/accuracies": 1.0, "rewards/chosen": 1.2217445373535156, "rewards/margins": 0.6104026436805725, "rewards/rejected": 0.6113418936729431, "step": 3381 }, { "epoch": 1.82, "learning_rate": 5.9496024418560365e-08, "logits/chosen": -1.9669727087020874, "logits/rejected": -1.964817762374878, "logps/chosen": -1.4439353942871094, "logps/rejected": -3.6765637397766113, "loss": 0.6573, "rewards/accuracies": 1.0, "rewards/chosen": 0.8829568028450012, "rewards/margins": 0.07292568683624268, "rewards/rejected": 0.8100311160087585, "step": 3382 }, { "epoch": 1.82, "learning_rate": 5.947458312692684e-08, "logits/chosen": -2.0217812061309814, "logits/rejected": -2.2916672229766846, "logps/chosen": -2.1997227668762207, "logps/rejected": -6.097021102905273, "loss": 0.6537, "rewards/accuracies": 1.0, "rewards/chosen": 0.9185394644737244, "rewards/margins": 0.08051437139511108, "rewards/rejected": 0.8380250930786133, "step": 3383 }, { "epoch": 1.83, "learning_rate": 5.945314002795385e-08, "logits/chosen": -2.0319125652313232, "logits/rejected": -2.3120927810668945, "logps/chosen": -0.24660618603229523, "logps/rejected": -0.2874084711074829, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.7588984370231628, "rewards/margins": -0.005914747714996338, "rewards/rejected": 0.7648131847381592, "step": 3384 }, { "epoch": 1.83, "learning_rate": 5.943169512573183e-08, "logits/chosen": -2.150545120239258, "logits/rejected": -2.2445549964904785, "logps/chosen": -0.26974332332611084, "logps/rejected": -0.26219668984413147, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8737732172012329, "rewards/margins": 0.01090383529663086, "rewards/rejected": 0.862869381904602, "step": 3385 }, { "epoch": 1.83, "learning_rate": 5.9410248424351536e-08, "logits/chosen": -2.0372328758239746, "logits/rejected": -2.2157106399536133, "logps/chosen": -2.749406099319458, "logps/rejected": -2.723125457763672, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.5359784960746765, "rewards/margins": 0.026028931140899658, "rewards/rejected": 0.5099495649337769, "step": 3386 }, { "epoch": 1.83, "learning_rate": 5.9388799927904055e-08, "logits/chosen": -2.0908892154693604, "logits/rejected": -2.08890700340271, "logps/chosen": -3.3331005573272705, "logps/rejected": -2.559800624847412, "loss": 0.5189, "rewards/accuracies": 1.0, "rewards/chosen": 1.1429522037506104, "rewards/margins": 0.3854435682296753, "rewards/rejected": 0.7575086355209351, "step": 3387 }, { "epoch": 1.83, "learning_rate": 5.936734964048083e-08, "logits/chosen": -1.9440556764602661, "logits/rejected": -2.274951219558716, "logps/chosen": -0.7055362462997437, "logps/rejected": -0.8170264363288879, "loss": 0.6594, "rewards/accuracies": 1.0, "rewards/chosen": 1.0056397914886475, "rewards/margins": 0.06870990991592407, "rewards/rejected": 0.9369298815727234, "step": 3388 }, { "epoch": 1.83, "learning_rate": 5.934589756617366e-08, "logits/chosen": -2.1301207542419434, "logits/rejected": -2.3575282096862793, "logps/chosen": -1.4818456172943115, "logps/rejected": -0.9605387449264526, "loss": 0.6997, "rewards/accuracies": 0.0, "rewards/chosen": 0.9708072543144226, "rewards/margins": -0.012969255447387695, "rewards/rejected": 0.9837765097618103, "step": 3389 }, { "epoch": 1.83, "learning_rate": 5.932444370907467e-08, "logits/chosen": -2.255099296569824, "logits/rejected": -2.174309015274048, "logps/chosen": -23.88195037841797, "logps/rejected": -4.015477657318115, "loss": 0.2073, "rewards/accuracies": 1.0, "rewards/chosen": 2.0218822956085205, "rewards/margins": 1.4680941104888916, "rewards/rejected": 0.5537882447242737, "step": 3390 }, { "epoch": 1.83, "learning_rate": 5.930298807327629e-08, "logits/chosen": -2.1465020179748535, "logits/rejected": -2.1461408138275146, "logps/chosen": -0.7341710329055786, "logps/rejected": -2.233225107192993, "loss": 0.6683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9290289878845215, "rewards/margins": 0.05037790536880493, "rewards/rejected": 0.8786510825157166, "step": 3391 }, { "epoch": 1.83, "learning_rate": 5.928153066287139e-08, "logits/chosen": -2.0310311317443848, "logits/rejected": -2.033586025238037, "logps/chosen": -4.169982433319092, "logps/rejected": -2.28061580657959, "loss": 0.321, "rewards/accuracies": 1.0, "rewards/chosen": 1.567856788635254, "rewards/margins": 0.9716032147407532, "rewards/rejected": 0.5962535738945007, "step": 3392 }, { "epoch": 1.83, "learning_rate": 5.9260071481953054e-08, "logits/chosen": -1.9980413913726807, "logits/rejected": -1.9972292184829712, "logps/chosen": -4.416202545166016, "logps/rejected": -0.5636749267578125, "loss": 0.7168, "rewards/accuracies": 0.0, "rewards/chosen": 0.8951333165168762, "rewards/margins": -0.04669445753097534, "rewards/rejected": 0.9418277740478516, "step": 3393 }, { "epoch": 1.83, "learning_rate": 5.9238610534614784e-08, "logits/chosen": -2.083890438079834, "logits/rejected": -2.2709546089172363, "logps/chosen": -0.8586791753768921, "logps/rejected": -2.172170400619507, "loss": 0.6225, "rewards/accuracies": 1.0, "rewards/chosen": 0.9478556513786316, "rewards/margins": 0.14656788110733032, "rewards/rejected": 0.8012877702713013, "step": 3394 }, { "epoch": 1.83, "learning_rate": 5.921714782495041e-08, "logits/chosen": -2.0547587871551514, "logits/rejected": -2.2700066566467285, "logps/chosen": -0.25232791900634766, "logps/rejected": -0.2910810112953186, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.8970298171043396, "rewards/margins": 0.013582468032836914, "rewards/rejected": 0.8834473490715027, "step": 3395 }, { "epoch": 1.83, "learning_rate": 5.919568335705405e-08, "logits/chosen": -2.181640386581421, "logits/rejected": -2.302422523498535, "logps/chosen": -6.061115264892578, "logps/rejected": -1.6037598848342896, "loss": 0.7688, "rewards/accuracies": 0.0, "rewards/chosen": 0.7317231297492981, "rewards/margins": -0.14591693878173828, "rewards/rejected": 0.8776400685310364, "step": 3396 }, { "epoch": 1.83, "learning_rate": 5.917421713502023e-08, "logits/chosen": -2.020822763442993, "logits/rejected": -2.2739627361297607, "logps/chosen": -0.9889042973518372, "logps/rejected": -1.0502973794937134, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8639727830886841, "rewards/margins": 0.012093544006347656, "rewards/rejected": 0.8518792390823364, "step": 3397 }, { "epoch": 1.83, "learning_rate": 5.915274916294377e-08, "logits/chosen": -2.0802550315856934, "logits/rejected": -2.216770648956299, "logps/chosen": -6.968913555145264, "logps/rejected": -4.79872465133667, "loss": 0.5999, "rewards/accuracies": 1.0, "rewards/chosen": 1.0607601404190063, "rewards/margins": 0.196153461933136, "rewards/rejected": 0.8646066784858704, "step": 3398 }, { "epoch": 1.83, "learning_rate": 5.913127944491981e-08, "logits/chosen": -1.98583984375, "logits/rejected": -2.2924559116363525, "logps/chosen": -0.6443578004837036, "logps/rejected": -3.320094347000122, "loss": 0.5489, "rewards/accuracies": 1.0, "rewards/chosen": 1.0087335109710693, "rewards/margins": 0.31294846534729004, "rewards/rejected": 0.6957850456237793, "step": 3399 }, { "epoch": 1.83, "learning_rate": 5.9109807985043856e-08, "logits/chosen": -2.065286874771118, "logits/rejected": -2.31223201751709, "logps/chosen": -7.153066158294678, "logps/rejected": -8.174762725830078, "loss": 0.7098, "rewards/accuracies": 0.0, "rewards/chosen": 1.0486927032470703, "rewards/margins": -0.033008575439453125, "rewards/rejected": 1.0817012786865234, "step": 3400 }, { "epoch": 1.83, "learning_rate": 5.9088334787411734e-08, "logits/chosen": -2.069939613342285, "logits/rejected": -2.079400062561035, "logps/chosen": -1.9318965673446655, "logps/rejected": -2.151496648788452, "loss": 0.4909, "rewards/accuracies": 1.0, "rewards/chosen": 1.1556307077407837, "rewards/margins": 0.4560474753379822, "rewards/rejected": 0.6995832324028015, "step": 3401 }, { "epoch": 1.83, "learning_rate": 5.906685985611955e-08, "logits/chosen": -2.1397011280059814, "logits/rejected": -2.2664389610290527, "logps/chosen": -0.4745177626609802, "logps/rejected": -0.6189069747924805, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.936725914478302, "rewards/margins": 0.01276552677154541, "rewards/rejected": 0.9239603877067566, "step": 3402 }, { "epoch": 1.84, "learning_rate": 5.904538319526386e-08, "logits/chosen": -2.2025060653686523, "logits/rejected": -2.2018494606018066, "logps/chosen": -2.0781192779541016, "logps/rejected": -4.830540180206299, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 1.0599271059036255, "rewards/margins": 0.530656099319458, "rewards/rejected": 0.5292710065841675, "step": 3403 }, { "epoch": 1.84, "learning_rate": 5.902390480894145e-08, "logits/chosen": -2.0188472270965576, "logits/rejected": -2.019306182861328, "logps/chosen": -0.45323318243026733, "logps/rejected": -4.470530986785889, "loss": 0.4947, "rewards/accuracies": 1.0, "rewards/chosen": 0.8432415127754211, "rewards/margins": 0.4462442994117737, "rewards/rejected": 0.39699721336364746, "step": 3404 }, { "epoch": 1.84, "learning_rate": 5.9002424701249456e-08, "logits/chosen": -1.932620644569397, "logits/rejected": -2.257969617843628, "logps/chosen": -3.6049981117248535, "logps/rejected": -3.56557035446167, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.7500311732292175, "rewards/margins": 0.01883375644683838, "rewards/rejected": 0.7311974167823792, "step": 3405 }, { "epoch": 1.84, "learning_rate": 5.8980942876285354e-08, "logits/chosen": -2.122670888900757, "logits/rejected": -2.09771990776062, "logps/chosen": -16.08179473876953, "logps/rejected": -8.349065780639648, "loss": 0.1897, "rewards/accuracies": 1.0, "rewards/chosen": 1.7937580347061157, "rewards/margins": 1.565976858139038, "rewards/rejected": 0.22778120636940002, "step": 3406 }, { "epoch": 1.84, "learning_rate": 5.8959459338146944e-08, "logits/chosen": -2.1697781085968018, "logits/rejected": -2.313109874725342, "logps/chosen": -3.804105520248413, "logps/rejected": -3.455519437789917, "loss": 0.7008, "rewards/accuracies": 0.0, "rewards/chosen": 0.92481929063797, "rewards/margins": -0.015204548835754395, "rewards/rejected": 0.9400238394737244, "step": 3407 }, { "epoch": 1.84, "learning_rate": 5.893797409093236e-08, "logits/chosen": -2.1444053649902344, "logits/rejected": -2.300477981567383, "logps/chosen": -7.9081711769104, "logps/rejected": -4.65346622467041, "loss": 0.7355, "rewards/accuracies": 0.0, "rewards/chosen": 0.7728656530380249, "rewards/margins": -0.08297878503799438, "rewards/rejected": 0.8558444380760193, "step": 3408 }, { "epoch": 1.84, "learning_rate": 5.8916487138740055e-08, "logits/chosen": -2.122845411300659, "logits/rejected": -2.217535972595215, "logps/chosen": -1.0867120027542114, "logps/rejected": -1.1764981746673584, "loss": 0.6743, "rewards/accuracies": 1.0, "rewards/chosen": 0.9097282290458679, "rewards/margins": 0.038012027740478516, "rewards/rejected": 0.8717162013053894, "step": 3409 }, { "epoch": 1.84, "learning_rate": 5.889499848566881e-08, "logits/chosen": -2.1636898517608643, "logits/rejected": -2.1694793701171875, "logps/chosen": -2.405118465423584, "logps/rejected": -3.7447092533111572, "loss": 0.3815, "rewards/accuracies": 1.0, "rewards/chosen": 1.2702867984771729, "rewards/margins": 0.76670241355896, "rewards/rejected": 0.5035843849182129, "step": 3410 }, { "epoch": 1.84, "learning_rate": 5.8873508135817707e-08, "logits/chosen": -2.1436567306518555, "logits/rejected": -2.1461241245269775, "logps/chosen": -2.4105477333068848, "logps/rejected": -1.2858099937438965, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 1.014369249343872, "rewards/margins": 0.04693758487701416, "rewards/rejected": 0.9674316644668579, "step": 3411 }, { "epoch": 1.84, "learning_rate": 5.885201609328621e-08, "logits/chosen": -2.1340062618255615, "logits/rejected": -2.141117572784424, "logps/chosen": -3.1363439559936523, "logps/rejected": -14.888474464416504, "loss": 0.3324, "rewards/accuracies": 1.0, "rewards/chosen": 1.0747588872909546, "rewards/margins": 0.9306572675704956, "rewards/rejected": 0.14410161972045898, "step": 3412 }, { "epoch": 1.84, "learning_rate": 5.883052236217401e-08, "logits/chosen": -1.9633159637451172, "logits/rejected": -2.263571262359619, "logps/chosen": -2.0122883319854736, "logps/rejected": -2.0001180171966553, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.7699678540229797, "rewards/margins": 0.021225929260253906, "rewards/rejected": 0.7487419247627258, "step": 3413 }, { "epoch": 1.84, "learning_rate": 5.880902694658123e-08, "logits/chosen": -1.9897857904434204, "logits/rejected": -1.9935003519058228, "logps/chosen": -2.0375406742095947, "logps/rejected": -2.6240408420562744, "loss": 0.5733, "rewards/accuracies": 1.0, "rewards/chosen": 1.0104519128799438, "rewards/margins": 0.25613319873809814, "rewards/rejected": 0.7543187141418457, "step": 3414 }, { "epoch": 1.84, "learning_rate": 5.8787529850608256e-08, "logits/chosen": -2.171445369720459, "logits/rejected": -2.1650784015655518, "logps/chosen": -4.18651008605957, "logps/rejected": -4.622015953063965, "loss": 0.6171, "rewards/accuracies": 1.0, "rewards/chosen": 0.8333329558372498, "rewards/margins": 0.15844941139221191, "rewards/rejected": 0.6748835444450378, "step": 3415 }, { "epoch": 1.84, "learning_rate": 5.8766031078355793e-08, "logits/chosen": -2.1342906951904297, "logits/rejected": -2.2995941638946533, "logps/chosen": -0.7523880004882812, "logps/rejected": -0.7939065098762512, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 1.0121670961380005, "rewards/margins": -0.0023941993713378906, "rewards/rejected": 1.0145612955093384, "step": 3416 }, { "epoch": 1.84, "learning_rate": 5.8744530633924874e-08, "logits/chosen": -2.0551488399505615, "logits/rejected": -2.2566421031951904, "logps/chosen": -0.7037832140922546, "logps/rejected": -0.6611053347587585, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.7257153987884521, "rewards/margins": 0.014164924621582031, "rewards/rejected": 0.7115504741668701, "step": 3417 }, { "epoch": 1.84, "learning_rate": 5.872302852141684e-08, "logits/chosen": -2.1066267490386963, "logits/rejected": -2.103839874267578, "logps/chosen": -3.7532145977020264, "logps/rejected": -2.059126377105713, "loss": 0.5543, "rewards/accuracies": 1.0, "rewards/chosen": 1.1140774488449097, "rewards/margins": 0.3001587390899658, "rewards/rejected": 0.8139187097549438, "step": 3418 }, { "epoch": 1.84, "learning_rate": 5.8701524744933375e-08, "logits/chosen": -1.9532967805862427, "logits/rejected": -2.224453926086426, "logps/chosen": -0.29460853338241577, "logps/rejected": -0.3309670090675354, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": 0.9399227499961853, "rewards/margins": -0.01847618818283081, "rewards/rejected": 0.9583989381790161, "step": 3419 }, { "epoch": 1.84, "learning_rate": 5.8680019308576455e-08, "logits/chosen": -2.2367279529571533, "logits/rejected": -2.053938865661621, "logps/chosen": -58.35523223876953, "logps/rejected": -0.3561406135559082, "loss": 0.1447, "rewards/accuracies": 1.0, "rewards/chosen": 2.705886125564575, "rewards/margins": 1.8600127696990967, "rewards/rejected": 0.8458732962608337, "step": 3420 }, { "epoch": 1.85, "learning_rate": 5.865851221644841e-08, "logits/chosen": -2.230963945388794, "logits/rejected": -2.080913782119751, "logps/chosen": -35.040283203125, "logps/rejected": -1.605297327041626, "loss": 0.1543, "rewards/accuracies": 1.0, "rewards/chosen": 2.7553372383117676, "rewards/margins": 1.7909965515136719, "rewards/rejected": 0.9643406867980957, "step": 3421 }, { "epoch": 1.85, "learning_rate": 5.863700347265184e-08, "logits/chosen": -2.0706145763397217, "logits/rejected": -2.2754223346710205, "logps/chosen": -0.3784860074520111, "logps/rejected": -0.3608957827091217, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.8633020520210266, "rewards/margins": 0.02289789915084839, "rewards/rejected": 0.8404041528701782, "step": 3422 }, { "epoch": 1.85, "learning_rate": 5.8615493081289667e-08, "logits/chosen": -2.048994302749634, "logits/rejected": -2.317683696746826, "logps/chosen": -0.5117603540420532, "logps/rejected": -0.5224511027336121, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.8276896476745605, "rewards/margins": 0.017655670642852783, "rewards/rejected": 0.8100339770317078, "step": 3423 }, { "epoch": 1.85, "learning_rate": 5.859398104646518e-08, "logits/chosen": -2.0678446292877197, "logits/rejected": -2.3026182651519775, "logps/chosen": -1.2114893198013306, "logps/rejected": -1.1786258220672607, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 1.1122993230819702, "rewards/margins": 0.003107905387878418, "rewards/rejected": 1.1091914176940918, "step": 3424 }, { "epoch": 1.85, "learning_rate": 5.8572467372281896e-08, "logits/chosen": -1.9954859018325806, "logits/rejected": -2.2632718086242676, "logps/chosen": -0.20351964235305786, "logps/rejected": -0.2315487265586853, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.7990749478340149, "rewards/margins": 0.0018242597579956055, "rewards/rejected": 0.7972506880760193, "step": 3425 }, { "epoch": 1.85, "learning_rate": 5.855095206284373e-08, "logits/chosen": -1.99443519115448, "logits/rejected": -1.997338056564331, "logps/chosen": -0.9958165884017944, "logps/rejected": -5.203180313110352, "loss": 0.5612, "rewards/accuracies": 1.0, "rewards/chosen": 0.809415340423584, "rewards/margins": 0.2838837504386902, "rewards/rejected": 0.5255315899848938, "step": 3426 }, { "epoch": 1.85, "learning_rate": 5.8529435122254857e-08, "logits/chosen": -2.06484317779541, "logits/rejected": -2.113394021987915, "logps/chosen": -2.962045907974243, "logps/rejected": -18.436594009399414, "loss": 0.2, "rewards/accuracies": 1.0, "rewards/chosen": 1.5105043649673462, "rewards/margins": 1.507860541343689, "rewards/rejected": 0.0026437758933752775, "step": 3427 }, { "epoch": 1.85, "learning_rate": 5.850791655461976e-08, "logits/chosen": -2.0986549854278564, "logits/rejected": -2.3353099822998047, "logps/chosen": -6.643024444580078, "logps/rejected": -6.993477821350098, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9585453271865845, "rewards/margins": 0.016308724880218506, "rewards/rejected": 0.942236602306366, "step": 3428 }, { "epoch": 1.85, "learning_rate": 5.8486396364043265e-08, "logits/chosen": -2.0468616485595703, "logits/rejected": -2.27089786529541, "logps/chosen": -0.27901336550712585, "logps/rejected": -0.2744554877281189, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.9637514352798462, "rewards/margins": 0.009427964687347412, "rewards/rejected": 0.9543234705924988, "step": 3429 }, { "epoch": 1.85, "learning_rate": 5.8464874554630484e-08, "logits/chosen": -2.14373517036438, "logits/rejected": -2.299905300140381, "logps/chosen": -1.818418264389038, "logps/rejected": -1.7333581447601318, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.9714263081550598, "rewards/margins": -0.002882838249206543, "rewards/rejected": 0.9743091464042664, "step": 3430 }, { "epoch": 1.85, "learning_rate": 5.8443351130486854e-08, "logits/chosen": -2.0337796211242676, "logits/rejected": -2.269071102142334, "logps/chosen": -0.3619040548801422, "logps/rejected": -0.39355897903442383, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.9522069096565247, "rewards/margins": 0.00011414289474487305, "rewards/rejected": 0.9520927667617798, "step": 3431 }, { "epoch": 1.85, "learning_rate": 5.8421826095718094e-08, "logits/chosen": -2.0972883701324463, "logits/rejected": -2.0012972354888916, "logps/chosen": -5.506025314331055, "logps/rejected": -4.640190124511719, "loss": 0.363, "rewards/accuracies": 1.0, "rewards/chosen": 1.6389217376708984, "rewards/margins": 0.826429545879364, "rewards/rejected": 0.8124921917915344, "step": 3432 }, { "epoch": 1.85, "learning_rate": 5.840029945443027e-08, "logits/chosen": -2.1426262855529785, "logits/rejected": -2.2666125297546387, "logps/chosen": -6.09939432144165, "logps/rejected": -5.152909278869629, "loss": 0.6677, "rewards/accuracies": 1.0, "rewards/chosen": 1.006736159324646, "rewards/margins": 0.05161327123641968, "rewards/rejected": 0.9551228880882263, "step": 3433 }, { "epoch": 1.85, "learning_rate": 5.8378771210729707e-08, "logits/chosen": -2.05351185798645, "logits/rejected": -2.0537869930267334, "logps/chosen": -5.533180236816406, "logps/rejected": -4.710904121398926, "loss": 0.4685, "rewards/accuracies": 1.0, "rewards/chosen": 1.1655899286270142, "rewards/margins": 0.5148066878318787, "rewards/rejected": 0.6507832407951355, "step": 3434 }, { "epoch": 1.85, "learning_rate": 5.835724136872306e-08, "logits/chosen": -2.032059907913208, "logits/rejected": -1.9984724521636963, "logps/chosen": -5.445085048675537, "logps/rejected": -3.635500431060791, "loss": 0.3218, "rewards/accuracies": 1.0, "rewards/chosen": 1.4571272134780884, "rewards/margins": 0.9685322046279907, "rewards/rejected": 0.48859497904777527, "step": 3435 }, { "epoch": 1.85, "learning_rate": 5.833570993251731e-08, "logits/chosen": -2.140561103820801, "logits/rejected": -2.1419589519500732, "logps/chosen": -3.5243771076202393, "logps/rejected": -4.097330570220947, "loss": 0.5714, "rewards/accuracies": 1.0, "rewards/chosen": 0.9991364479064941, "rewards/margins": 0.2605012059211731, "rewards/rejected": 0.738635241985321, "step": 3436 }, { "epoch": 1.85, "learning_rate": 5.831417690621972e-08, "logits/chosen": -2.06520938873291, "logits/rejected": -2.391806125640869, "logps/chosen": -5.908322334289551, "logps/rejected": -18.87112045288086, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.7691258788108826, "rewards/margins": 0.008471429347991943, "rewards/rejected": 0.7606544494628906, "step": 3437 }, { "epoch": 1.85, "learning_rate": 5.829264229393783e-08, "logits/chosen": -2.203526735305786, "logits/rejected": -2.2032132148742676, "logps/chosen": -3.9564692974090576, "logps/rejected": -4.271234512329102, "loss": 0.4877, "rewards/accuracies": 1.0, "rewards/chosen": 1.4689687490463257, "rewards/margins": 0.464300274848938, "rewards/rejected": 1.0046684741973877, "step": 3438 }, { "epoch": 1.85, "learning_rate": 5.8271106099779555e-08, "logits/chosen": -2.038006067276001, "logits/rejected": -2.252920150756836, "logps/chosen": -10.455883026123047, "logps/rejected": -10.864219665527344, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 1.0338010787963867, "rewards/margins": 0.01912248134613037, "rewards/rejected": 1.0146785974502563, "step": 3439 }, { "epoch": 1.86, "learning_rate": 5.8249568327853026e-08, "logits/chosen": -2.0419983863830566, "logits/rejected": -2.330702304840088, "logps/chosen": -3.974740982055664, "logps/rejected": -3.880317211151123, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.5819898843765259, "rewards/margins": 0.019843757152557373, "rewards/rejected": 0.5621461272239685, "step": 3440 }, { "epoch": 1.86, "learning_rate": 5.8228028982266706e-08, "logits/chosen": -2.1153786182403564, "logits/rejected": -2.3383870124816895, "logps/chosen": -2.2680375576019287, "logps/rejected": -6.243518352508545, "loss": 0.6568, "rewards/accuracies": 1.0, "rewards/chosen": 1.0033892393112183, "rewards/margins": 0.0740862488746643, "rewards/rejected": 0.929302990436554, "step": 3441 }, { "epoch": 1.86, "learning_rate": 5.820648806712942e-08, "logits/chosen": -2.082491636276245, "logits/rejected": -2.338942050933838, "logps/chosen": -1.189258337020874, "logps/rejected": -1.1002219915390015, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.8084436655044556, "rewards/margins": 0.021761596202850342, "rewards/rejected": 0.7866820693016052, "step": 3442 }, { "epoch": 1.86, "learning_rate": 5.8184945586550206e-08, "logits/chosen": -2.102064371109009, "logits/rejected": -2.2705438137054443, "logps/chosen": -9.421661376953125, "logps/rejected": -0.8559615015983582, "loss": 0.7472, "rewards/accuracies": 0.0, "rewards/chosen": 0.8313698172569275, "rewards/margins": -0.10524868965148926, "rewards/rejected": 0.9366185069084167, "step": 3443 }, { "epoch": 1.86, "learning_rate": 5.8163401544638435e-08, "logits/chosen": -2.0133159160614014, "logits/rejected": -2.247847318649292, "logps/chosen": -0.3283810019493103, "logps/rejected": -0.3498832881450653, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 1.0065768957138062, "rewards/margins": 0.010489404201507568, "rewards/rejected": 0.9960874915122986, "step": 3444 }, { "epoch": 1.86, "learning_rate": 5.814185594550377e-08, "logits/chosen": -2.086597204208374, "logits/rejected": -2.0879640579223633, "logps/chosen": -0.3734760582447052, "logps/rejected": -3.3637590408325195, "loss": 0.4912, "rewards/accuracies": 1.0, "rewards/chosen": 1.081976294517517, "rewards/margins": 0.45521247386932373, "rewards/rejected": 0.6267638206481934, "step": 3445 }, { "epoch": 1.86, "learning_rate": 5.81203087932562e-08, "logits/chosen": -2.0837554931640625, "logits/rejected": -2.09047794342041, "logps/chosen": -2.2267582416534424, "logps/rejected": -3.741405487060547, "loss": 0.4291, "rewards/accuracies": 1.0, "rewards/chosen": 1.204722285270691, "rewards/margins": 0.6239843964576721, "rewards/rejected": 0.5807378888130188, "step": 3446 }, { "epoch": 1.86, "learning_rate": 5.809876009200597e-08, "logits/chosen": -2.0994620323181152, "logits/rejected": -2.0995559692382812, "logps/chosen": -0.8734638690948486, "logps/rejected": -3.8572025299072266, "loss": 0.4937, "rewards/accuracies": 1.0, "rewards/chosen": 1.087737798690796, "rewards/margins": 0.4489355683326721, "rewards/rejected": 0.6388022303581238, "step": 3447 }, { "epoch": 1.86, "learning_rate": 5.807720984586364e-08, "logits/chosen": -2.1577742099761963, "logits/rejected": -2.314704179763794, "logps/chosen": -0.9599155187606812, "logps/rejected": -0.9738329648971558, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.9738689661026001, "rewards/margins": -0.0006822347640991211, "rewards/rejected": 0.9745512008666992, "step": 3448 }, { "epoch": 1.86, "learning_rate": 5.805565805894007e-08, "logits/chosen": -2.0951008796691895, "logits/rejected": -2.100605010986328, "logps/chosen": -2.4026880264282227, "logps/rejected": -3.45408296585083, "loss": 0.503, "rewards/accuracies": 1.0, "rewards/chosen": 1.013595700263977, "rewards/margins": 0.4251251816749573, "rewards/rejected": 0.5884705185890198, "step": 3449 }, { "epoch": 1.86, "learning_rate": 5.80341047353464e-08, "logits/chosen": -2.154216766357422, "logits/rejected": -2.308722972869873, "logps/chosen": -0.3565250337123871, "logps/rejected": -15.779742240905762, "loss": 0.7283, "rewards/accuracies": 0.0, "rewards/chosen": 0.8475490808486938, "rewards/margins": -0.06901806592941284, "rewards/rejected": 0.9165671467781067, "step": 3450 }, { "epoch": 1.86, "learning_rate": 5.8012549879194064e-08, "logits/chosen": -2.0235819816589355, "logits/rejected": -2.032501697540283, "logps/chosen": -1.5790205001831055, "logps/rejected": -2.4858007431030273, "loss": 0.4295, "rewards/accuracies": 1.0, "rewards/chosen": 1.2771605253219604, "rewards/margins": 0.6226193904876709, "rewards/rejected": 0.6545411348342896, "step": 3451 }, { "epoch": 1.86, "learning_rate": 5.799099349459479e-08, "logits/chosen": -2.1240949630737305, "logits/rejected": -2.1266872882843018, "logps/chosen": -3.5359511375427246, "logps/rejected": -7.734814643859863, "loss": 0.4967, "rewards/accuracies": 1.0, "rewards/chosen": 1.2670973539352417, "rewards/margins": 0.44128185510635376, "rewards/rejected": 0.8258154988288879, "step": 3452 }, { "epoch": 1.86, "learning_rate": 5.796943558566061e-08, "logits/chosen": -2.2367944717407227, "logits/rejected": -2.1254851818084717, "logps/chosen": -27.58199691772461, "logps/rejected": -4.524842262268066, "loss": 0.1359, "rewards/accuracies": 1.0, "rewards/chosen": 2.4098281860351562, "rewards/margins": 1.926742434501648, "rewards/rejected": 0.4830857217311859, "step": 3453 }, { "epoch": 1.86, "learning_rate": 5.794787615650385e-08, "logits/chosen": -2.100613594055176, "logits/rejected": -2.3141965866088867, "logps/chosen": -0.47372397780418396, "logps/rejected": -0.5562621355056763, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.9328011870384216, "rewards/margins": 0.018184959888458252, "rewards/rejected": 0.9146162271499634, "step": 3454 }, { "epoch": 1.86, "learning_rate": 5.7926315211237096e-08, "logits/chosen": -2.040154457092285, "logits/rejected": -2.0381946563720703, "logps/chosen": -6.768692970275879, "logps/rejected": -4.97011661529541, "loss": 0.3615, "rewards/accuracies": 1.0, "rewards/chosen": 1.2277781963348389, "rewards/margins": 0.8312978744506836, "rewards/rejected": 0.3964802920818329, "step": 3455 }, { "epoch": 1.86, "learning_rate": 5.790475275397324e-08, "logits/chosen": -2.139662265777588, "logits/rejected": -2.2990708351135254, "logps/chosen": -0.7369317412376404, "logps/rejected": -1.2475966215133667, "loss": 0.6322, "rewards/accuracies": 1.0, "rewards/chosen": 0.9434975981712341, "rewards/margins": 0.12595295906066895, "rewards/rejected": 0.8175446391105652, "step": 3456 }, { "epoch": 1.86, "learning_rate": 5.7883188788825475e-08, "logits/chosen": -2.1393117904663086, "logits/rejected": -2.1356120109558105, "logps/chosen": -7.33495569229126, "logps/rejected": -3.3384199142456055, "loss": 0.3416, "rewards/accuracies": 1.0, "rewards/chosen": 1.5389413833618164, "rewards/margins": 0.8985374569892883, "rewards/rejected": 0.6404039263725281, "step": 3457 }, { "epoch": 1.87, "learning_rate": 5.786162331990726e-08, "logits/chosen": -2.038942813873291, "logits/rejected": -2.3069005012512207, "logps/chosen": -0.4096411466598511, "logps/rejected": -0.360270619392395, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.9555441737174988, "rewards/margins": 0.019357383251190186, "rewards/rejected": 0.9361867904663086, "step": 3458 }, { "epoch": 1.87, "learning_rate": 5.784005635133236e-08, "logits/chosen": -2.161888360977173, "logits/rejected": -2.241760730743408, "logps/chosen": -4.910516262054443, "logps/rejected": -15.635002136230469, "loss": 0.5078, "rewards/accuracies": 1.0, "rewards/chosen": 1.1616991758346558, "rewards/margins": 0.4130169749259949, "rewards/rejected": 0.7486822009086609, "step": 3459 }, { "epoch": 1.87, "learning_rate": 5.781848788721481e-08, "logits/chosen": -2.045487403869629, "logits/rejected": -2.1142730712890625, "logps/chosen": -2.9929521083831787, "logps/rejected": -25.443920135498047, "loss": 0.1692, "rewards/accuracies": 1.0, "rewards/chosen": 1.6361852884292603, "rewards/margins": 1.6910181045532227, "rewards/rejected": -0.05483284220099449, "step": 3460 }, { "epoch": 1.87, "learning_rate": 5.779691793166893e-08, "logits/chosen": -2.0564780235290527, "logits/rejected": -2.2752492427825928, "logps/chosen": -0.363065242767334, "logps/rejected": -0.40293464064598083, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.8241731524467468, "rewards/margins": 0.003721952438354492, "rewards/rejected": 0.8204512000083923, "step": 3461 }, { "epoch": 1.87, "learning_rate": 5.7775346488809364e-08, "logits/chosen": -2.2091166973114014, "logits/rejected": -2.2072463035583496, "logps/chosen": -1.0174897909164429, "logps/rejected": -6.521155834197998, "loss": 0.4308, "rewards/accuracies": 1.0, "rewards/chosen": 1.0421632528305054, "rewards/margins": 0.6189893484115601, "rewards/rejected": 0.4231738746166229, "step": 3462 }, { "epoch": 1.87, "learning_rate": 5.7753773562750975e-08, "logits/chosen": -2.1498043537139893, "logits/rejected": -2.3100969791412354, "logps/chosen": -24.28032112121582, "logps/rejected": -7.962828636169434, "loss": 0.5788, "rewards/accuracies": 1.0, "rewards/chosen": 1.185683250427246, "rewards/margins": 0.24357402324676514, "rewards/rejected": 0.942109227180481, "step": 3463 }, { "epoch": 1.87, "learning_rate": 5.773219915760894e-08, "logits/chosen": -1.9793622493743896, "logits/rejected": -1.9890975952148438, "logps/chosen": -3.3487775325775146, "logps/rejected": -4.159550666809082, "loss": 0.4005, "rewards/accuracies": 1.0, "rewards/chosen": 1.3811887502670288, "rewards/margins": 0.7080031037330627, "rewards/rejected": 0.6731856465339661, "step": 3464 }, { "epoch": 1.87, "learning_rate": 5.7710623277498736e-08, "logits/chosen": -1.9923573732376099, "logits/rejected": -2.2671103477478027, "logps/chosen": -2.1001267433166504, "logps/rejected": -1.6773240566253662, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.8872987627983093, "rewards/margins": 0.004459738731384277, "rewards/rejected": 0.882839024066925, "step": 3465 }, { "epoch": 1.87, "learning_rate": 5.768904592653611e-08, "logits/chosen": -2.0326366424560547, "logits/rejected": -2.3242897987365723, "logps/chosen": -0.6698380708694458, "logps/rejected": -5.206633567810059, "loss": 0.6215, "rewards/accuracies": 1.0, "rewards/chosen": 1.056083083152771, "rewards/margins": 0.14883160591125488, "rewards/rejected": 0.9072514772415161, "step": 3466 }, { "epoch": 1.87, "learning_rate": 5.7667467108837066e-08, "logits/chosen": -2.085610866546631, "logits/rejected": -2.094587564468384, "logps/chosen": -3.7988100051879883, "logps/rejected": -11.282506942749023, "loss": 0.6719, "rewards/accuracies": 1.0, "rewards/chosen": 0.8377783894538879, "rewards/margins": 0.04302489757537842, "rewards/rejected": 0.7947534918785095, "step": 3467 }, { "epoch": 1.87, "learning_rate": 5.764588682851791e-08, "logits/chosen": -2.0509395599365234, "logits/rejected": -2.285891056060791, "logps/chosen": -0.28604021668434143, "logps/rejected": -0.34097418189048767, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.9788013696670532, "rewards/margins": 0.00035762786865234375, "rewards/rejected": 0.9784437417984009, "step": 3468 }, { "epoch": 1.87, "learning_rate": 5.762430508969524e-08, "logits/chosen": -2.0383832454681396, "logits/rejected": -2.3394453525543213, "logps/chosen": -0.33026397228240967, "logps/rejected": -0.3174091875553131, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.8573700785636902, "rewards/margins": 0.004349887371063232, "rewards/rejected": 0.853020191192627, "step": 3469 }, { "epoch": 1.87, "learning_rate": 5.76027218964859e-08, "logits/chosen": -2.0908117294311523, "logits/rejected": -2.283806085586548, "logps/chosen": -1.5158412456512451, "logps/rejected": -6.249507904052734, "loss": 0.5674, "rewards/accuracies": 1.0, "rewards/chosen": 0.7949180603027344, "rewards/margins": 0.2695029377937317, "rewards/rejected": 0.5254151225090027, "step": 3470 }, { "epoch": 1.87, "learning_rate": 5.7581137253007026e-08, "logits/chosen": -2.0547704696655273, "logits/rejected": -2.2094197273254395, "logps/chosen": -1.0661431550979614, "logps/rejected": -2.0481252670288086, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 1.0368276834487915, "rewards/margins": -0.014002084732055664, "rewards/rejected": 1.0508297681808472, "step": 3471 }, { "epoch": 1.87, "learning_rate": 5.755955116337604e-08, "logits/chosen": -2.0218422412872314, "logits/rejected": -2.0125184059143066, "logps/chosen": -1.875625729560852, "logps/rejected": -3.324044704437256, "loss": 0.4834, "rewards/accuracies": 1.0, "rewards/chosen": 1.1820324659347534, "rewards/margins": 0.4755105972290039, "rewards/rejected": 0.7065218687057495, "step": 3472 }, { "epoch": 1.87, "learning_rate": 5.753796363171063e-08, "logits/chosen": -2.126901388168335, "logits/rejected": -2.325321912765503, "logps/chosen": -1.1431407928466797, "logps/rejected": -1.205565333366394, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 0.8871588110923767, "rewards/margins": -0.0015597939491271973, "rewards/rejected": 0.8887186050415039, "step": 3473 }, { "epoch": 1.87, "learning_rate": 5.7516374662128756e-08, "logits/chosen": -2.071681499481201, "logits/rejected": -2.071535348892212, "logps/chosen": -2.525329351425171, "logps/rejected": -3.847963333129883, "loss": 0.3456, "rewards/accuracies": 1.0, "rewards/chosen": 1.4580916166305542, "rewards/margins": 0.8847108483314514, "rewards/rejected": 0.5733807682991028, "step": 3474 }, { "epoch": 1.87, "learning_rate": 5.7494784258748644e-08, "logits/chosen": -1.9990979433059692, "logits/rejected": -2.0072624683380127, "logps/chosen": -6.782487869262695, "logps/rejected": -1.5247896909713745, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 1.154918909072876, "rewards/margins": 0.010270118713378906, "rewards/rejected": 1.144648790359497, "step": 3475 }, { "epoch": 1.87, "learning_rate": 5.747319242568883e-08, "logits/chosen": -2.010866165161133, "logits/rejected": -2.0074825286865234, "logps/chosen": -7.530473709106445, "logps/rejected": -4.020512580871582, "loss": 0.3228, "rewards/accuracies": 1.0, "rewards/chosen": 1.4171863794326782, "rewards/margins": 0.9649026393890381, "rewards/rejected": 0.4522837698459625, "step": 3476 }, { "epoch": 1.88, "learning_rate": 5.745159916706811e-08, "logits/chosen": -2.0474789142608643, "logits/rejected": -2.2806010246276855, "logps/chosen": -0.2368636280298233, "logps/rejected": -0.25430163741111755, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.837527871131897, "rewards/margins": 0.025911331176757812, "rewards/rejected": 0.8116165399551392, "step": 3477 }, { "epoch": 1.88, "learning_rate": 5.743000448700551e-08, "logits/chosen": -2.1335015296936035, "logits/rejected": -2.338721990585327, "logps/chosen": -0.9082865118980408, "logps/rejected": -0.8382911682128906, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 1.0181899070739746, "rewards/margins": 0.010605454444885254, "rewards/rejected": 1.0075844526290894, "step": 3478 }, { "epoch": 1.88, "learning_rate": 5.7408408389620364e-08, "logits/chosen": -2.1893539428710938, "logits/rejected": -2.188289165496826, "logps/chosen": -0.27878934144973755, "logps/rejected": -4.326995372772217, "loss": 0.4462, "rewards/accuracies": 1.0, "rewards/chosen": 0.9426161050796509, "rewards/margins": 0.5757321119308472, "rewards/rejected": 0.3668839931488037, "step": 3479 }, { "epoch": 1.88, "learning_rate": 5.7386810879032286e-08, "logits/chosen": -2.1709680557250977, "logits/rejected": -2.1393916606903076, "logps/chosen": -21.24112319946289, "logps/rejected": -4.4696550369262695, "loss": 0.255, "rewards/accuracies": 1.0, "rewards/chosen": 1.75944983959198, "rewards/margins": 1.2361854314804077, "rewards/rejected": 0.5232644081115723, "step": 3480 }, { "epoch": 1.88, "learning_rate": 5.736521195936111e-08, "logits/chosen": -1.973813772201538, "logits/rejected": -2.2282087802886963, "logps/chosen": -0.39544785022735596, "logps/rejected": -0.3762659728527069, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.8822707533836365, "rewards/margins": 0.027013421058654785, "rewards/rejected": 0.8552573323249817, "step": 3481 }, { "epoch": 1.88, "learning_rate": 5.7343611634727017e-08, "logits/chosen": -2.060807943344116, "logits/rejected": -2.0594520568847656, "logps/chosen": -0.46671062707901, "logps/rejected": -1.937034010887146, "loss": 0.5544, "rewards/accuracies": 1.0, "rewards/chosen": 1.0020679235458374, "rewards/margins": 0.29988574981689453, "rewards/rejected": 0.7021821737289429, "step": 3482 }, { "epoch": 1.88, "learning_rate": 5.7322009909250404e-08, "logits/chosen": -2.1389739513397217, "logits/rejected": -2.136472225189209, "logps/chosen": -4.6562066078186035, "logps/rejected": -6.167760848999023, "loss": 0.4773, "rewards/accuracies": 1.0, "rewards/chosen": 1.0632985830307007, "rewards/margins": 0.49138569831848145, "rewards/rejected": 0.5719128847122192, "step": 3483 }, { "epoch": 1.88, "learning_rate": 5.730040678705191e-08, "logits/chosen": -2.1835920810699463, "logits/rejected": -2.301496744155884, "logps/chosen": -10.405210494995117, "logps/rejected": -12.927400588989258, "loss": 0.6987, "rewards/accuracies": 0.0, "rewards/chosen": 1.1005194187164307, "rewards/margins": -0.01109921932220459, "rewards/rejected": 1.1116186380386353, "step": 3484 }, { "epoch": 1.88, "learning_rate": 5.7278802272252505e-08, "logits/chosen": -2.126291036605835, "logits/rejected": -2.3543174266815186, "logps/chosen": -0.6702545285224915, "logps/rejected": -10.287359237670898, "loss": 0.6709, "rewards/accuracies": 1.0, "rewards/chosen": 1.0508575439453125, "rewards/margins": 0.04502403736114502, "rewards/rejected": 1.0058335065841675, "step": 3485 }, { "epoch": 1.88, "learning_rate": 5.725719636897338e-08, "logits/chosen": -2.035475254058838, "logits/rejected": -2.2946178913116455, "logps/chosen": -0.4649749994277954, "logps/rejected": -0.4680611491203308, "loss": 0.6667, "rewards/accuracies": 1.0, "rewards/chosen": 0.9902973175048828, "rewards/margins": 0.05354243516921997, "rewards/rejected": 0.9367548823356628, "step": 3486 }, { "epoch": 1.88, "learning_rate": 5.723558908133601e-08, "logits/chosen": -2.0094192028045654, "logits/rejected": -2.011085271835327, "logps/chosen": -0.7792384624481201, "logps/rejected": -6.510221481323242, "loss": 0.4691, "rewards/accuracies": 1.0, "rewards/chosen": 1.0917508602142334, "rewards/margins": 0.513167142868042, "rewards/rejected": 0.5785837173461914, "step": 3487 }, { "epoch": 1.88, "learning_rate": 5.7213980413462125e-08, "logits/chosen": -2.22025728225708, "logits/rejected": -2.3231542110443115, "logps/chosen": -0.2658320367336273, "logps/rejected": -0.26204463839530945, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9967054724693298, "rewards/margins": 0.007639884948730469, "rewards/rejected": 0.9890655875205994, "step": 3488 }, { "epoch": 1.88, "learning_rate": 5.719237036947373e-08, "logits/chosen": -2.066117763519287, "logits/rejected": -2.2636618614196777, "logps/chosen": -0.5366431474685669, "logps/rejected": -0.5262055397033691, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.9279711842536926, "rewards/margins": 0.008439719676971436, "rewards/rejected": 0.9195314645767212, "step": 3489 }, { "epoch": 1.88, "learning_rate": 5.7170758953493056e-08, "logits/chosen": -2.1338818073272705, "logits/rejected": -2.1365644931793213, "logps/chosen": -4.590586185455322, "logps/rejected": -0.3804084360599518, "loss": 0.583, "rewards/accuracies": 1.0, "rewards/chosen": 1.20912766456604, "rewards/margins": 0.23395788669586182, "rewards/rejected": 0.9751697778701782, "step": 3490 }, { "epoch": 1.88, "learning_rate": 5.714914616964266e-08, "logits/chosen": -2.0812859535217285, "logits/rejected": -2.080897808074951, "logps/chosen": -0.38372716307640076, "logps/rejected": -3.8769919872283936, "loss": 0.4771, "rewards/accuracies": 1.0, "rewards/chosen": 0.9898649454116821, "rewards/margins": 0.4919133484363556, "rewards/rejected": 0.49795159697532654, "step": 3491 }, { "epoch": 1.88, "learning_rate": 5.71275320220453e-08, "logits/chosen": -2.119436502456665, "logits/rejected": -2.0002667903900146, "logps/chosen": -15.167238235473633, "logps/rejected": -10.54205322265625, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 1.7261523008346558, "rewards/margins": 0.8194834589958191, "rewards/rejected": 0.9066688418388367, "step": 3492 }, { "epoch": 1.88, "learning_rate": 5.7105916514824026e-08, "logits/chosen": -2.0292630195617676, "logits/rejected": -2.0320935249328613, "logps/chosen": -5.993442058563232, "logps/rejected": -12.234163284301758, "loss": 0.38, "rewards/accuracies": 1.0, "rewards/chosen": 1.6764471530914307, "rewards/margins": 0.7716575860977173, "rewards/rejected": 0.9047895669937134, "step": 3493 }, { "epoch": 1.88, "learning_rate": 5.708429965210214e-08, "logits/chosen": -2.1501851081848145, "logits/rejected": -2.370161294937134, "logps/chosen": -0.7584909796714783, "logps/rejected": -5.798664093017578, "loss": 0.6111, "rewards/accuracies": 1.0, "rewards/chosen": 0.9373798370361328, "rewards/margins": 0.17144745588302612, "rewards/rejected": 0.7659323811531067, "step": 3494 }, { "epoch": 1.89, "learning_rate": 5.70626814380032e-08, "logits/chosen": -2.0665438175201416, "logits/rejected": -2.328719139099121, "logps/chosen": -1.586431860923767, "logps/rejected": -1.677395224571228, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 1.0229476690292358, "rewards/margins": 0.011905789375305176, "rewards/rejected": 1.0110418796539307, "step": 3495 }, { "epoch": 1.89, "learning_rate": 5.704106187665102e-08, "logits/chosen": -2.0783538818359375, "logits/rejected": -2.09466290473938, "logps/chosen": -1.3402541875839233, "logps/rejected": -3.4146854877471924, "loss": 0.3883, "rewards/accuracies": 1.0, "rewards/chosen": 1.5522021055221558, "rewards/margins": 0.7456252574920654, "rewards/rejected": 0.8065768480300903, "step": 3496 }, { "epoch": 1.89, "learning_rate": 5.7019440972169675e-08, "logits/chosen": -2.294243574142456, "logits/rejected": -2.397287130355835, "logps/chosen": -8.135627746582031, "logps/rejected": -13.476191520690918, "loss": 0.6018, "rewards/accuracies": 1.0, "rewards/chosen": 1.1662914752960205, "rewards/margins": 0.19198566675186157, "rewards/rejected": 0.9743058085441589, "step": 3497 }, { "epoch": 1.89, "learning_rate": 5.69978187286835e-08, "logits/chosen": -1.978934645652771, "logits/rejected": -2.2516186237335205, "logps/chosen": -0.20793524384498596, "logps/rejected": -0.23084768652915955, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.9402238726615906, "rewards/margins": 0.029165029525756836, "rewards/rejected": 0.9110588431358337, "step": 3498 }, { "epoch": 1.89, "learning_rate": 5.697619515031709e-08, "logits/chosen": -2.192828416824341, "logits/rejected": -2.0904624462127686, "logps/chosen": -25.49410057067871, "logps/rejected": -3.4427292346954346, "loss": 0.1592, "rewards/accuracies": 1.0, "rewards/chosen": 2.2319064140319824, "rewards/margins": 1.7568061351776123, "rewards/rejected": 0.47510024905204773, "step": 3499 }, { "epoch": 1.89, "learning_rate": 5.6954570241195264e-08, "logits/chosen": -1.9862638711929321, "logits/rejected": -2.313302516937256, "logps/chosen": -0.45233094692230225, "logps/rejected": -0.4969547986984253, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.9344663619995117, "rewards/margins": 0.03228527307510376, "rewards/rejected": 0.902181088924408, "step": 3500 }, { "epoch": 1.89, "learning_rate": 5.6932944005443126e-08, "logits/chosen": -2.1650915145874023, "logits/rejected": -2.292193651199341, "logps/chosen": -19.90188980102539, "logps/rejected": -2.1003031730651855, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.9890739321708679, "rewards/margins": -0.009447693824768066, "rewards/rejected": 0.998521625995636, "step": 3501 }, { "epoch": 1.89, "learning_rate": 5.691131644718603e-08, "logits/chosen": -2.1761059761047363, "logits/rejected": -2.1741139888763428, "logps/chosen": -6.96134614944458, "logps/rejected": -3.239957809448242, "loss": 0.2504, "rewards/accuracies": 1.0, "rewards/chosen": 1.7972278594970703, "rewards/margins": 1.256736159324646, "rewards/rejected": 0.5404917001724243, "step": 3502 }, { "epoch": 1.89, "learning_rate": 5.688968757054956e-08, "logits/chosen": -2.0628912448883057, "logits/rejected": -2.1498196125030518, "logps/chosen": -3.1070804595947266, "logps/rejected": -19.145671844482422, "loss": 0.4111, "rewards/accuracies": 1.0, "rewards/chosen": 1.3120195865631104, "rewards/margins": 0.6763635873794556, "rewards/rejected": 0.6356559991836548, "step": 3503 }, { "epoch": 1.89, "learning_rate": 5.6868057379659594e-08, "logits/chosen": -2.1568753719329834, "logits/rejected": -2.152787923812866, "logps/chosen": -4.2317070960998535, "logps/rejected": -4.472555637359619, "loss": 0.2508, "rewards/accuracies": 1.0, "rewards/chosen": 1.6757984161376953, "rewards/margins": 1.2552945613861084, "rewards/rejected": 0.4205038547515869, "step": 3504 }, { "epoch": 1.89, "learning_rate": 5.684642587864221e-08, "logits/chosen": -2.050581216812134, "logits/rejected": -2.0293538570404053, "logps/chosen": -14.0223970413208, "logps/rejected": -4.851287841796875, "loss": 0.4347, "rewards/accuracies": 1.0, "rewards/chosen": 1.7247310876846313, "rewards/margins": 0.6079952716827393, "rewards/rejected": 1.116735816001892, "step": 3505 }, { "epoch": 1.89, "learning_rate": 5.682479307162377e-08, "logits/chosen": -2.0789003372192383, "logits/rejected": -2.104445695877075, "logps/chosen": -23.706939697265625, "logps/rejected": -8.697484016418457, "loss": 0.3292, "rewards/accuracies": 1.0, "rewards/chosen": 1.7893608808517456, "rewards/margins": 0.9418103694915771, "rewards/rejected": 0.8475505113601685, "step": 3506 }, { "epoch": 1.89, "learning_rate": 5.680315896273087e-08, "logits/chosen": -2.148270845413208, "logits/rejected": -2.194362163543701, "logps/chosen": -5.548172473907471, "logps/rejected": -22.63215446472168, "loss": 0.4213, "rewards/accuracies": 1.0, "rewards/chosen": 1.3261736631393433, "rewards/margins": 0.646321177482605, "rewards/rejected": 0.6798524856567383, "step": 3507 }, { "epoch": 1.89, "learning_rate": 5.6781523556090373e-08, "logits/chosen": -2.0556657314300537, "logits/rejected": -2.3436007499694824, "logps/chosen": -0.6171280741691589, "logps/rejected": -0.7195884585380554, "loss": 0.6763, "rewards/accuracies": 1.0, "rewards/chosen": 1.048740267753601, "rewards/margins": 0.03401529788970947, "rewards/rejected": 1.0147249698638916, "step": 3508 }, { "epoch": 1.89, "learning_rate": 5.6759886855829355e-08, "logits/chosen": -2.2171363830566406, "logits/rejected": -2.21779203414917, "logps/chosen": -0.5039788484573364, "logps/rejected": -3.6347310543060303, "loss": 0.4603, "rewards/accuracies": 1.0, "rewards/chosen": 0.987740159034729, "rewards/margins": 0.5368646383285522, "rewards/rejected": 0.45087549090385437, "step": 3509 }, { "epoch": 1.89, "learning_rate": 5.6738248866075176e-08, "logits/chosen": -2.117737293243408, "logits/rejected": -2.1125643253326416, "logps/chosen": -2.976100206375122, "logps/rejected": -2.2898197174072266, "loss": 0.3355, "rewards/accuracies": 1.0, "rewards/chosen": 1.5532017946243286, "rewards/margins": 0.9198284149169922, "rewards/rejected": 0.6333733797073364, "step": 3510 }, { "epoch": 1.89, "learning_rate": 5.671660959095542e-08, "logits/chosen": -2.0795533657073975, "logits/rejected": -2.0879170894622803, "logps/chosen": -1.8074225187301636, "logps/rejected": -3.6745622158050537, "loss": 0.4262, "rewards/accuracies": 1.0, "rewards/chosen": 1.1210435628890991, "rewards/margins": 0.6323225498199463, "rewards/rejected": 0.48872098326683044, "step": 3511 }, { "epoch": 1.89, "learning_rate": 5.669496903459793e-08, "logits/chosen": -2.1073195934295654, "logits/rejected": -2.1221683025360107, "logps/chosen": -3.544950008392334, "logps/rejected": -4.893767356872559, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/chosen": 1.8736652135849, "rewards/margins": 1.1891391277313232, "rewards/rejected": 0.6845261454582214, "step": 3512 }, { "epoch": 1.89, "learning_rate": 5.6673327201130775e-08, "logits/chosen": -1.9690979719161987, "logits/rejected": -2.2569053173065186, "logps/chosen": -1.7662057876586914, "logps/rejected": -1.4961280822753906, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": 0.7332428097724915, "rewards/margins": -0.008936107158660889, "rewards/rejected": 0.7421789169311523, "step": 3513 }, { "epoch": 1.9, "learning_rate": 5.6651684094682265e-08, "logits/chosen": -2.1118552684783936, "logits/rejected": -2.1831231117248535, "logps/chosen": -0.32777896523475647, "logps/rejected": -29.991037368774414, "loss": 0.2115, "rewards/accuracies": 1.0, "rewards/chosen": 0.9674126505851746, "rewards/margins": 1.4458775520324707, "rewards/rejected": -0.47846490144729614, "step": 3514 }, { "epoch": 1.9, "learning_rate": 5.6630039719381006e-08, "logits/chosen": -2.0566790103912354, "logits/rejected": -2.0469141006469727, "logps/chosen": -5.102618217468262, "logps/rejected": -2.1693408489227295, "loss": 0.3961, "rewards/accuracies": 1.0, "rewards/chosen": 1.6430103778839111, "rewards/margins": 0.721515417098999, "rewards/rejected": 0.9214949607849121, "step": 3515 }, { "epoch": 1.9, "learning_rate": 5.660839407935577e-08, "logits/chosen": -2.048362970352173, "logits/rejected": -2.0425596237182617, "logps/chosen": -3.428772211074829, "logps/rejected": -3.942570209503174, "loss": 0.3924, "rewards/accuracies": 1.0, "rewards/chosen": 1.3776220083236694, "rewards/margins": 0.7329620718955994, "rewards/rejected": 0.6446599364280701, "step": 3516 }, { "epoch": 1.9, "learning_rate": 5.6586747178735616e-08, "logits/chosen": -2.0685038566589355, "logits/rejected": -2.293363332748413, "logps/chosen": -5.600905895233154, "logps/rejected": -1.4615530967712402, "loss": 0.7061, "rewards/accuracies": 0.0, "rewards/chosen": 1.0313631296157837, "rewards/margins": -0.02568340301513672, "rewards/rejected": 1.0570465326309204, "step": 3517 }, { "epoch": 1.9, "learning_rate": 5.6565099021649844e-08, "logits/chosen": -1.9694002866744995, "logits/rejected": -1.9676167964935303, "logps/chosen": -0.5159682035446167, "logps/rejected": -2.7078700065612793, "loss": 0.5929, "rewards/accuracies": 1.0, "rewards/chosen": 0.9808452725410461, "rewards/margins": 0.21157467365264893, "rewards/rejected": 0.7692705988883972, "step": 3518 }, { "epoch": 1.9, "learning_rate": 5.654344961222797e-08, "logits/chosen": -2.021930456161499, "logits/rejected": -2.0167174339294434, "logps/chosen": -2.72336483001709, "logps/rejected": -3.813547134399414, "loss": 0.6167, "rewards/accuracies": 1.0, "rewards/chosen": 1.110640048980713, "rewards/margins": 0.15913933515548706, "rewards/rejected": 0.9515007138252258, "step": 3519 }, { "epoch": 1.9, "learning_rate": 5.6521798954599756e-08, "logits/chosen": -2.0294783115386963, "logits/rejected": -2.021486282348633, "logps/chosen": -2.1904327869415283, "logps/rejected": -5.128512382507324, "loss": 0.3477, "rewards/accuracies": 1.0, "rewards/chosen": 1.309477686882019, "rewards/margins": 0.8774541616439819, "rewards/rejected": 0.4320235252380371, "step": 3520 }, { "epoch": 1.9, "learning_rate": 5.650014705289522e-08, "logits/chosen": -2.1155707836151123, "logits/rejected": -2.1203062534332275, "logps/chosen": -2.3111648559570312, "logps/rejected": -7.092585563659668, "loss": 0.3959, "rewards/accuracies": 1.0, "rewards/chosen": 1.3911422491073608, "rewards/margins": 0.7220456004142761, "rewards/rejected": 0.6690966486930847, "step": 3521 }, { "epoch": 1.9, "learning_rate": 5.6478493911244615e-08, "logits/chosen": -2.008981704711914, "logits/rejected": -2.2788002490997314, "logps/chosen": -0.834830105304718, "logps/rejected": -0.9859533905982971, "loss": 0.6747, "rewards/accuracies": 1.0, "rewards/chosen": 0.907157838344574, "rewards/margins": 0.03724837303161621, "rewards/rejected": 0.8699094653129578, "step": 3522 }, { "epoch": 1.9, "learning_rate": 5.645683953377841e-08, "logits/chosen": -1.9384287595748901, "logits/rejected": -1.9377015829086304, "logps/chosen": -0.5122095942497253, "logps/rejected": -2.027550220489502, "loss": 0.6576, "rewards/accuracies": 1.0, "rewards/chosen": 0.8773903846740723, "rewards/margins": 0.07244652509689331, "rewards/rejected": 0.804943859577179, "step": 3523 }, { "epoch": 1.9, "learning_rate": 5.643518392462734e-08, "logits/chosen": -2.1141204833984375, "logits/rejected": -2.1146152019500732, "logps/chosen": -1.161662220954895, "logps/rejected": -3.40966534614563, "loss": 0.512, "rewards/accuracies": 1.0, "rewards/chosen": 0.9881617426872253, "rewards/margins": 0.4025537371635437, "rewards/rejected": 0.5856080055236816, "step": 3524 }, { "epoch": 1.9, "learning_rate": 5.6413527087922294e-08, "logits/chosen": -2.1144893169403076, "logits/rejected": -2.1201958656311035, "logps/chosen": -4.129276275634766, "logps/rejected": -3.9331729412078857, "loss": 0.4414, "rewards/accuracies": 1.0, "rewards/chosen": 1.1810945272445679, "rewards/margins": 0.589128851890564, "rewards/rejected": 0.5919656753540039, "step": 3525 }, { "epoch": 1.9, "learning_rate": 5.6391869027794533e-08, "logits/chosen": -2.0385897159576416, "logits/rejected": -2.0455210208892822, "logps/chosen": -1.702377200126648, "logps/rejected": -2.988225221633911, "loss": 0.4727, "rewards/accuracies": 1.0, "rewards/chosen": 1.0555394887924194, "rewards/margins": 0.5037650465965271, "rewards/rejected": 0.5517744421958923, "step": 3526 }, { "epoch": 1.9, "learning_rate": 5.637020974837543e-08, "logits/chosen": -2.164003849029541, "logits/rejected": -2.129681348800659, "logps/chosen": -4.842970848083496, "logps/rejected": -8.331809997558594, "loss": 0.2756, "rewards/accuracies": 1.0, "rewards/chosen": 1.3098450899124146, "rewards/margins": 1.1477088928222656, "rewards/rejected": 0.16213618218898773, "step": 3527 }, { "epoch": 1.9, "learning_rate": 5.634854925379666e-08, "logits/chosen": -2.014502763748169, "logits/rejected": -2.3764712810516357, "logps/chosen": -8.260600090026855, "logps/rejected": -14.378615379333496, "loss": 0.9297, "rewards/accuracies": 0.0, "rewards/chosen": 0.8219742774963379, "rewards/margins": -0.4277973175048828, "rewards/rejected": 1.2497715950012207, "step": 3528 }, { "epoch": 1.9, "learning_rate": 5.632688754819008e-08, "logits/chosen": -2.007894515991211, "logits/rejected": -2.0035197734832764, "logps/chosen": -4.040560722351074, "logps/rejected": -2.0312998294830322, "loss": 0.442, "rewards/accuracies": 1.0, "rewards/chosen": 1.3979333639144897, "rewards/margins": 0.5873460173606873, "rewards/rejected": 0.8105873465538025, "step": 3529 }, { "epoch": 1.9, "learning_rate": 5.630522463568783e-08, "logits/chosen": -2.1051461696624756, "logits/rejected": -2.1034164428710938, "logps/chosen": -1.6635404825210571, "logps/rejected": -8.602385520935059, "loss": 0.3891, "rewards/accuracies": 1.0, "rewards/chosen": 0.95630943775177, "rewards/margins": 0.7431942224502563, "rewards/rejected": 0.21311521530151367, "step": 3530 }, { "epoch": 1.9, "learning_rate": 5.6283560520422246e-08, "logits/chosen": -2.1586050987243652, "logits/rejected": -2.1546263694763184, "logps/chosen": -2.7216551303863525, "logps/rejected": -5.72296142578125, "loss": 0.6048, "rewards/accuracies": 1.0, "rewards/chosen": 0.9198784232139587, "rewards/margins": 0.18536055088043213, "rewards/rejected": 0.7345178723335266, "step": 3531 }, { "epoch": 1.91, "learning_rate": 5.626189520652589e-08, "logits/chosen": -1.9883674383163452, "logits/rejected": -2.2370152473449707, "logps/chosen": -0.47324320673942566, "logps/rejected": -0.513300895690918, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.9592952728271484, "rewards/margins": -0.0008450746536254883, "rewards/rejected": 0.9601403474807739, "step": 3532 }, { "epoch": 1.91, "learning_rate": 5.624022869813156e-08, "logits/chosen": -1.9985644817352295, "logits/rejected": -2.007297992706299, "logps/chosen": -3.2533087730407715, "logps/rejected": -1.6302814483642578, "loss": 0.622, "rewards/accuracies": 1.0, "rewards/chosen": 1.0884348154067993, "rewards/margins": 0.14778614044189453, "rewards/rejected": 0.9406486749649048, "step": 3533 }, { "epoch": 1.91, "learning_rate": 5.621856099937231e-08, "logits/chosen": -2.0444962978363037, "logits/rejected": -2.283060073852539, "logps/chosen": -0.9532325267791748, "logps/rejected": -1.3205593824386597, "loss": 0.6641, "rewards/accuracies": 1.0, "rewards/chosen": 0.8345944285392761, "rewards/margins": 0.0589599609375, "rewards/rejected": 0.7756344676017761, "step": 3534 }, { "epoch": 1.91, "learning_rate": 5.6196892114381386e-08, "logits/chosen": -2.029085874557495, "logits/rejected": -2.294849395751953, "logps/chosen": -0.7827205657958984, "logps/rejected": -0.722453773021698, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 1.051788568496704, "rewards/margins": 0.023320674896240234, "rewards/rejected": 1.0284678936004639, "step": 3535 }, { "epoch": 1.91, "learning_rate": 5.617522204729227e-08, "logits/chosen": -2.162292718887329, "logits/rejected": -2.0784695148468018, "logps/chosen": -35.4454345703125, "logps/rejected": -9.292102813720703, "loss": 0.2506, "rewards/accuracies": 1.0, "rewards/chosen": 2.196927309036255, "rewards/margins": 1.2558751106262207, "rewards/rejected": 0.941052258014679, "step": 3536 }, { "epoch": 1.91, "learning_rate": 5.615355080223866e-08, "logits/chosen": -2.04610276222229, "logits/rejected": -2.0531609058380127, "logps/chosen": -1.4290438890457153, "logps/rejected": -5.397918701171875, "loss": 0.4454, "rewards/accuracies": 1.0, "rewards/chosen": 1.1593940258026123, "rewards/margins": 0.5779024362564087, "rewards/rejected": 0.5814915895462036, "step": 3537 }, { "epoch": 1.91, "learning_rate": 5.61318783833545e-08, "logits/chosen": -1.92180597782135, "logits/rejected": -2.3326282501220703, "logps/chosen": -5.234292984008789, "logps/rejected": -4.259895324707031, "loss": 0.6517, "rewards/accuracies": 1.0, "rewards/chosen": 0.8858518600463867, "rewards/margins": 0.08469235897064209, "rewards/rejected": 0.8011595010757446, "step": 3538 }, { "epoch": 1.91, "learning_rate": 5.6110204794773975e-08, "logits/chosen": -2.0131571292877197, "logits/rejected": -2.021597385406494, "logps/chosen": -1.5024535655975342, "logps/rejected": -2.8318333625793457, "loss": 0.5032, "rewards/accuracies": 1.0, "rewards/chosen": 1.0764596462249756, "rewards/margins": 0.4246203303337097, "rewards/rejected": 0.6518393158912659, "step": 3539 }, { "epoch": 1.91, "learning_rate": 5.608853004063141e-08, "logits/chosen": -2.082420587539673, "logits/rejected": -2.0738773345947266, "logps/chosen": -6.019118785858154, "logps/rejected": -4.121924877166748, "loss": 0.4288, "rewards/accuracies": 1.0, "rewards/chosen": 1.2611192464828491, "rewards/margins": 0.6246321797370911, "rewards/rejected": 0.6364870667457581, "step": 3540 }, { "epoch": 1.91, "learning_rate": 5.6066854125061445e-08, "logits/chosen": -2.0415611267089844, "logits/rejected": -2.0390357971191406, "logps/chosen": -5.541471004486084, "logps/rejected": -4.109943389892578, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": 1.1600240468978882, "rewards/margins": 0.6512162685394287, "rewards/rejected": 0.5088077783584595, "step": 3541 }, { "epoch": 1.91, "learning_rate": 5.604517705219889e-08, "logits/chosen": -2.162174701690674, "logits/rejected": -2.1549036502838135, "logps/chosen": -3.4020843505859375, "logps/rejected": -7.968546390533447, "loss": 0.2688, "rewards/accuracies": 1.0, "rewards/chosen": 1.4700459241867065, "rewards/margins": 1.1765702962875366, "rewards/rejected": 0.29347559809684753, "step": 3542 }, { "epoch": 1.91, "learning_rate": 5.6023498826178796e-08, "logits/chosen": -1.9742333889007568, "logits/rejected": -1.9871279001235962, "logps/chosen": -1.3260778188705444, "logps/rejected": -7.601970672607422, "loss": 0.373, "rewards/accuracies": 1.0, "rewards/chosen": 1.3291901350021362, "rewards/margins": 0.7939061522483826, "rewards/rejected": 0.5352839827537537, "step": 3543 }, { "epoch": 1.91, "learning_rate": 5.6001819451136424e-08, "logits/chosen": -2.064272403717041, "logits/rejected": -2.0733871459960938, "logps/chosen": -2.923985242843628, "logps/rejected": -1.3568400144577026, "loss": 0.4404, "rewards/accuracies": 1.0, "rewards/chosen": 1.3281000852584839, "rewards/margins": 0.5917665362358093, "rewards/rejected": 0.7363335490226746, "step": 3544 }, { "epoch": 1.91, "learning_rate": 5.598013893120726e-08, "logits/chosen": -1.9755516052246094, "logits/rejected": -1.9734265804290771, "logps/chosen": -1.7822532653808594, "logps/rejected": -4.90472412109375, "loss": 0.4839, "rewards/accuracies": 1.0, "rewards/chosen": 1.2699130773544312, "rewards/margins": 0.47426050901412964, "rewards/rejected": 0.7956525683403015, "step": 3545 }, { "epoch": 1.91, "learning_rate": 5.5958457270527024e-08, "logits/chosen": -2.077033042907715, "logits/rejected": -2.26471209526062, "logps/chosen": -5.730521202087402, "logps/rejected": -1.2900080680847168, "loss": 0.7128, "rewards/accuracies": 0.0, "rewards/chosen": 0.6763169169425964, "rewards/margins": -0.03898388147354126, "rewards/rejected": 0.7153007984161377, "step": 3546 }, { "epoch": 1.91, "learning_rate": 5.593677447323163e-08, "logits/chosen": -2.0381393432617188, "logits/rejected": -2.2853806018829346, "logps/chosen": -0.7413297891616821, "logps/rejected": -0.6234229803085327, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.888933002948761, "rewards/margins": 0.010833144187927246, "rewards/rejected": 0.8780998587608337, "step": 3547 }, { "epoch": 1.91, "learning_rate": 5.591509054345716e-08, "logits/chosen": -2.061610698699951, "logits/rejected": -2.0558273792266846, "logps/chosen": -14.386872291564941, "logps/rejected": -9.569031715393066, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 1.439121127128601, "rewards/margins": 1.2048695087432861, "rewards/rejected": 0.23425160348415375, "step": 3548 }, { "epoch": 1.91, "learning_rate": 5.589340548534005e-08, "logits/chosen": -2.023247480392456, "logits/rejected": -2.0301551818847656, "logps/chosen": -1.5124545097351074, "logps/rejected": -3.0208122730255127, "loss": 0.4714, "rewards/accuracies": 1.0, "rewards/chosen": 1.0777019262313843, "rewards/margins": 0.5070487856864929, "rewards/rejected": 0.5706531405448914, "step": 3549 }, { "epoch": 1.91, "learning_rate": 5.587171930301682e-08, "logits/chosen": -2.1955554485321045, "logits/rejected": -2.2658848762512207, "logps/chosen": -1.5228415727615356, "logps/rejected": -1.4567400217056274, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.9723720550537109, "rewards/margins": 0.008143305778503418, "rewards/rejected": 0.9642287492752075, "step": 3550 }, { "epoch": 1.92, "learning_rate": 5.585003200062427e-08, "logits/chosen": -1.9592442512512207, "logits/rejected": -2.254307508468628, "logps/chosen": -0.8924710154533386, "logps/rejected": -1.0166369676589966, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.8469900488853455, "rewards/margins": 0.026688814163208008, "rewards/rejected": 0.8203012347221375, "step": 3551 }, { "epoch": 1.92, "learning_rate": 5.5828343582299386e-08, "logits/chosen": -2.0261192321777344, "logits/rejected": -2.0314877033233643, "logps/chosen": -1.405945897102356, "logps/rejected": -3.1149353981018066, "loss": 0.484, "rewards/accuracies": 1.0, "rewards/chosen": 1.0810407400131226, "rewards/margins": 0.4739038348197937, "rewards/rejected": 0.6071369051933289, "step": 3552 }, { "epoch": 1.92, "learning_rate": 5.580665405217938e-08, "logits/chosen": -1.9885733127593994, "logits/rejected": -2.268176555633545, "logps/chosen": -0.6531713008880615, "logps/rejected": -0.7122986316680908, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.8890512585639954, "rewards/margins": 0.029593348503112793, "rewards/rejected": 0.8594579100608826, "step": 3553 }, { "epoch": 1.92, "learning_rate": 5.5784963414401676e-08, "logits/chosen": -2.0865283012390137, "logits/rejected": -2.342317819595337, "logps/chosen": -2.19407320022583, "logps/rejected": -1.897362470626831, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 1.077417254447937, "rewards/margins": 0.033469438552856445, "rewards/rejected": 1.0439478158950806, "step": 3554 }, { "epoch": 1.92, "learning_rate": 5.576327167310391e-08, "logits/chosen": -1.9927898645401, "logits/rejected": -2.285832405090332, "logps/chosen": -0.9051976203918457, "logps/rejected": -1.0735232830047607, "loss": 0.6653, "rewards/accuracies": 1.0, "rewards/chosen": 0.7665245532989502, "rewards/margins": 0.05641055107116699, "rewards/rejected": 0.7101140022277832, "step": 3555 }, { "epoch": 1.92, "learning_rate": 5.574157883242392e-08, "logits/chosen": -2.040299892425537, "logits/rejected": -2.2831809520721436, "logps/chosen": -0.4317118525505066, "logps/rejected": -0.5528035163879395, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 1.0617858171463013, "rewards/margins": 0.017068028450012207, "rewards/rejected": 1.044717788696289, "step": 3556 }, { "epoch": 1.92, "learning_rate": 5.571988489649976e-08, "logits/chosen": -2.079909086227417, "logits/rejected": -2.081141233444214, "logps/chosen": -2.5739941596984863, "logps/rejected": -2.644866943359375, "loss": 0.3494, "rewards/accuracies": 1.0, "rewards/chosen": 1.778586983680725, "rewards/margins": 0.8718823790550232, "rewards/rejected": 0.9067046046257019, "step": 3557 }, { "epoch": 1.92, "learning_rate": 5.569818986946969e-08, "logits/chosen": -2.0970020294189453, "logits/rejected": -2.0977470874786377, "logps/chosen": -1.2293517589569092, "logps/rejected": -5.673381328582764, "loss": 0.4172, "rewards/accuracies": 1.0, "rewards/chosen": 1.026713490486145, "rewards/margins": 0.6582738161087036, "rewards/rejected": 0.368439644575119, "step": 3558 }, { "epoch": 1.92, "learning_rate": 5.5676493755472176e-08, "logits/chosen": -2.0379090309143066, "logits/rejected": -2.093897581100464, "logps/chosen": -2.920254707336426, "logps/rejected": -23.554874420166016, "loss": 0.3643, "rewards/accuracies": 1.0, "rewards/chosen": 1.0944161415100098, "rewards/margins": 0.8219971656799316, "rewards/rejected": 0.2724189758300781, "step": 3559 }, { "epoch": 1.92, "learning_rate": 5.565479655864591e-08, "logits/chosen": -2.0841526985168457, "logits/rejected": -2.29768443107605, "logps/chosen": -3.162118673324585, "logps/rejected": -7.26962947845459, "loss": 0.7436, "rewards/accuracies": 0.0, "rewards/chosen": 1.0470308065414429, "rewards/margins": -0.09847736358642578, "rewards/rejected": 1.1455081701278687, "step": 3560 }, { "epoch": 1.92, "learning_rate": 5.5633098283129766e-08, "logits/chosen": -2.046377420425415, "logits/rejected": -2.0467448234558105, "logps/chosen": -3.7571253776550293, "logps/rejected": -4.16413688659668, "loss": 0.5092, "rewards/accuracies": 1.0, "rewards/chosen": 1.1139814853668213, "rewards/margins": 0.409481942653656, "rewards/rejected": 0.7044995427131653, "step": 3561 }, { "epoch": 1.92, "learning_rate": 5.561139893306285e-08, "logits/chosen": -2.0620360374450684, "logits/rejected": -2.2589023113250732, "logps/chosen": -0.3798796832561493, "logps/rejected": -0.34177669882774353, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.8334580659866333, "rewards/margins": 0.02381831407546997, "rewards/rejected": 0.8096397519111633, "step": 3562 }, { "epoch": 1.92, "learning_rate": 5.5589698512584416e-08, "logits/chosen": -1.9993231296539307, "logits/rejected": -2.0079867839813232, "logps/chosen": -1.4887442588806152, "logps/rejected": -3.2827908992767334, "loss": 0.4672, "rewards/accuracies": 1.0, "rewards/chosen": 1.0204843282699585, "rewards/margins": 0.5182015895843506, "rewards/rejected": 0.5022827386856079, "step": 3563 }, { "epoch": 1.92, "learning_rate": 5.5567997025834e-08, "logits/chosen": -2.0781166553497314, "logits/rejected": -2.2903735637664795, "logps/chosen": -0.6905509829521179, "logps/rejected": -0.5979124903678894, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.9832615256309509, "rewards/margins": -0.001371920108795166, "rewards/rejected": 0.9846334457397461, "step": 3564 }, { "epoch": 1.92, "learning_rate": 5.5546294476951283e-08, "logits/chosen": -1.9809321165084839, "logits/rejected": -2.2919912338256836, "logps/chosen": -2.35705828666687, "logps/rejected": -11.21920394897461, "loss": 0.5955, "rewards/accuracies": 1.0, "rewards/chosen": 0.9342414736747742, "rewards/margins": 0.2059289813041687, "rewards/rejected": 0.7283124923706055, "step": 3565 }, { "epoch": 1.92, "learning_rate": 5.552459087007619e-08, "logits/chosen": -2.070392370223999, "logits/rejected": -2.257589817047119, "logps/chosen": -7.409225940704346, "logps/rejected": -9.241772651672363, "loss": 0.5924, "rewards/accuracies": 1.0, "rewards/chosen": 0.9459339380264282, "rewards/margins": 0.21279019117355347, "rewards/rejected": 0.7331437468528748, "step": 3566 }, { "epoch": 1.92, "learning_rate": 5.550288620934882e-08, "logits/chosen": -2.196547508239746, "logits/rejected": -2.205414056777954, "logps/chosen": -2.6390976905822754, "logps/rejected": -6.177763938903809, "loss": 0.421, "rewards/accuracies": 1.0, "rewards/chosen": 0.8580846786499023, "rewards/margins": 0.6472762227058411, "rewards/rejected": 0.21080847084522247, "step": 3567 }, { "epoch": 1.92, "learning_rate": 5.548118049890948e-08, "logits/chosen": -1.995440125465393, "logits/rejected": -2.257542371749878, "logps/chosen": -0.5120357871055603, "logps/rejected": -0.525400698184967, "loss": 0.6724, "rewards/accuracies": 1.0, "rewards/chosen": 1.0433135032653809, "rewards/margins": 0.04186069965362549, "rewards/rejected": 1.0014528036117554, "step": 3568 }, { "epoch": 1.93, "learning_rate": 5.5459473742898666e-08, "logits/chosen": -2.0870463848114014, "logits/rejected": -2.0762484073638916, "logps/chosen": -0.3103162348270416, "logps/rejected": -7.409613132476807, "loss": 0.4122, "rewards/accuracies": 1.0, "rewards/chosen": 1.0201371908187866, "rewards/margins": 0.6731674671173096, "rewards/rejected": 0.34696975350379944, "step": 3569 }, { "epoch": 1.93, "learning_rate": 5.54377659454571e-08, "logits/chosen": -1.9880061149597168, "logits/rejected": -1.9664207696914673, "logps/chosen": -10.379344940185547, "logps/rejected": -3.1365232467651367, "loss": 0.4914, "rewards/accuracies": 1.0, "rewards/chosen": 1.5190414190292358, "rewards/margins": 0.4548492431640625, "rewards/rejected": 1.0641921758651733, "step": 3570 }, { "epoch": 1.93, "learning_rate": 5.541605711072569e-08, "logits/chosen": -2.152667760848999, "logits/rejected": -2.1586101055145264, "logps/chosen": -2.794283390045166, "logps/rejected": -1.7396785020828247, "loss": 0.6484, "rewards/accuracies": 1.0, "rewards/chosen": 0.9710363745689392, "rewards/margins": 0.09150820970535278, "rewards/rejected": 0.8795281648635864, "step": 3571 }, { "epoch": 1.93, "learning_rate": 5.539434724284553e-08, "logits/chosen": -2.1730754375457764, "logits/rejected": -2.2796125411987305, "logps/chosen": -1.535827875137329, "logps/rejected": -1.98128342628479, "loss": 0.6614, "rewards/accuracies": 1.0, "rewards/chosen": 1.0486698150634766, "rewards/margins": 0.06445366144180298, "rewards/rejected": 0.9842161536216736, "step": 3572 }, { "epoch": 1.93, "learning_rate": 5.537263634595793e-08, "logits/chosen": -2.0615651607513428, "logits/rejected": -2.20974063873291, "logps/chosen": -0.3093990683555603, "logps/rejected": -0.32055267691612244, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8225873112678528, "rewards/margins": 0.019830822944641113, "rewards/rejected": 0.8027564883232117, "step": 3573 }, { "epoch": 1.93, "learning_rate": 5.535092442420437e-08, "logits/chosen": -2.0067219734191895, "logits/rejected": -2.007662534713745, "logps/chosen": -0.8513359427452087, "logps/rejected": -3.6467206478118896, "loss": 0.5308, "rewards/accuracies": 1.0, "rewards/chosen": 0.971467912197113, "rewards/margins": 0.35622239112854004, "rewards/rejected": 0.615245521068573, "step": 3574 }, { "epoch": 1.93, "learning_rate": 5.532921148172656e-08, "logits/chosen": -2.1623408794403076, "logits/rejected": -2.197246789932251, "logps/chosen": -0.6280813813209534, "logps/rejected": -7.811491966247559, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 1.0125778913497925, "rewards/margins": 0.007927656173706055, "rewards/rejected": 1.0046502351760864, "step": 3575 }, { "epoch": 1.93, "learning_rate": 5.530749752266637e-08, "logits/chosen": -2.127497911453247, "logits/rejected": -2.1290667057037354, "logps/chosen": -0.2812548279762268, "logps/rejected": -4.53927755355835, "loss": 0.4909, "rewards/accuracies": 1.0, "rewards/chosen": 0.9826798439025879, "rewards/margins": 0.4560862183570862, "rewards/rejected": 0.5265936255455017, "step": 3576 }, { "epoch": 1.93, "learning_rate": 5.528578255116591e-08, "logits/chosen": -2.107893466949463, "logits/rejected": -2.2923591136932373, "logps/chosen": -0.44594550132751465, "logps/rejected": -0.42632603645324707, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.9420632719993591, "rewards/margins": 0.006765484809875488, "rewards/rejected": 0.9352977871894836, "step": 3577 }, { "epoch": 1.93, "learning_rate": 5.5264066571367443e-08, "logits/chosen": -2.0895304679870605, "logits/rejected": -2.382797956466675, "logps/chosen": -19.35395050048828, "logps/rejected": -16.68227767944336, "loss": 0.6151, "rewards/accuracies": 1.0, "rewards/chosen": 0.28816109895706177, "rewards/margins": 0.162791445851326, "rewards/rejected": 0.12536965310573578, "step": 3578 }, { "epoch": 1.93, "learning_rate": 5.524234958741342e-08, "logits/chosen": -2.063263177871704, "logits/rejected": -2.063138008117676, "logps/chosen": -0.18215498328208923, "logps/rejected": -5.757275581359863, "loss": 0.4514, "rewards/accuracies": 1.0, "rewards/chosen": 0.9333397746086121, "rewards/margins": 0.5612853765487671, "rewards/rejected": 0.37205439805984497, "step": 3579 }, { "epoch": 1.93, "learning_rate": 5.522063160344651e-08, "logits/chosen": -2.1339271068573, "logits/rejected": -2.2926292419433594, "logps/chosen": -0.28268787264823914, "logps/rejected": -0.31595122814178467, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.8089914321899414, "rewards/margins": -0.00860130786895752, "rewards/rejected": 0.8175927400588989, "step": 3580 }, { "epoch": 1.93, "learning_rate": 5.519891262360956e-08, "logits/chosen": -2.062347888946533, "logits/rejected": -2.059978723526001, "logps/chosen": -2.1392791271209717, "logps/rejected": -5.754088401794434, "loss": 0.2693, "rewards/accuracies": 1.0, "rewards/chosen": 1.5398868322372437, "rewards/margins": 1.1744253635406494, "rewards/rejected": 0.36546143889427185, "step": 3581 }, { "epoch": 1.93, "learning_rate": 5.517719265204561e-08, "logits/chosen": -2.228606700897217, "logits/rejected": -2.1959550380706787, "logps/chosen": -24.249217987060547, "logps/rejected": -11.458085060119629, "loss": 0.3715, "rewards/accuracies": 1.0, "rewards/chosen": 1.8688801527023315, "rewards/margins": 0.7987798452377319, "rewards/rejected": 1.0701003074645996, "step": 3582 }, { "epoch": 1.93, "learning_rate": 5.515547169289789e-08, "logits/chosen": -2.0529098510742188, "logits/rejected": -2.2489020824432373, "logps/chosen": -1.379016637802124, "logps/rejected": -1.4027544260025024, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.853720486164093, "rewards/margins": 0.02595198154449463, "rewards/rejected": 0.8277685046195984, "step": 3583 }, { "epoch": 1.93, "learning_rate": 5.513374975030983e-08, "logits/chosen": -2.194214105606079, "logits/rejected": -2.2040584087371826, "logps/chosen": -2.41658091545105, "logps/rejected": -4.555185794830322, "loss": 0.3416, "rewards/accuracies": 1.0, "rewards/chosen": 1.3701865673065186, "rewards/margins": 0.8986208438873291, "rewards/rejected": 0.47156569361686707, "step": 3584 }, { "epoch": 1.93, "learning_rate": 5.511202682842503e-08, "logits/chosen": -2.0779106616973877, "logits/rejected": -2.3497815132141113, "logps/chosen": -0.5897924900054932, "logps/rejected": -0.6337481737136841, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 1.1569263935089111, "rewards/margins": 0.013787984848022461, "rewards/rejected": 1.1431384086608887, "step": 3585 }, { "epoch": 1.93, "learning_rate": 5.5090302931387254e-08, "logits/chosen": -2.043447971343994, "logits/rejected": -2.2483773231506348, "logps/chosen": -0.3430750072002411, "logps/rejected": -4.887028217315674, "loss": 0.5565, "rewards/accuracies": 1.0, "rewards/chosen": 0.9438452124595642, "rewards/margins": 0.29502439498901367, "rewards/rejected": 0.6488208174705505, "step": 3586 }, { "epoch": 1.93, "learning_rate": 5.506857806334049e-08, "logits/chosen": -2.0742805004119873, "logits/rejected": -2.0633559226989746, "logps/chosen": -4.292989730834961, "logps/rejected": -3.4175424575805664, "loss": 0.4973, "rewards/accuracies": 1.0, "rewards/chosen": 1.1185418367385864, "rewards/margins": 0.4396907091140747, "rewards/rejected": 0.6788511276245117, "step": 3587 }, { "epoch": 1.94, "learning_rate": 5.504685222842892e-08, "logits/chosen": -2.0290110111236572, "logits/rejected": -2.2563483715057373, "logps/chosen": -9.361058235168457, "logps/rejected": -5.44539737701416, "loss": 0.7906, "rewards/accuracies": 0.0, "rewards/chosen": 0.8340689539909363, "rewards/margins": -0.18623369932174683, "rewards/rejected": 1.020302653312683, "step": 3588 }, { "epoch": 1.94, "learning_rate": 5.502512543079688e-08, "logits/chosen": -2.0759146213531494, "logits/rejected": -2.072016954421997, "logps/chosen": -3.577786445617676, "logps/rejected": -2.9811437129974365, "loss": 0.5728, "rewards/accuracies": 1.0, "rewards/chosen": 1.0895485877990723, "rewards/margins": 0.25709617137908936, "rewards/rejected": 0.8324524164199829, "step": 3589 }, { "epoch": 1.94, "learning_rate": 5.5003397674588895e-08, "logits/chosen": -2.134760618209839, "logits/rejected": -2.3003499507904053, "logps/chosen": -0.8962617516517639, "logps/rejected": -0.8480983972549438, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9663345217704773, "rewards/margins": 0.01828169822692871, "rewards/rejected": 0.9480528235435486, "step": 3590 }, { "epoch": 1.94, "learning_rate": 5.498166896394969e-08, "logits/chosen": -2.124143600463867, "logits/rejected": -2.13777494430542, "logps/chosen": -2.6923489570617676, "logps/rejected": -5.588944911956787, "loss": 0.4348, "rewards/accuracies": 1.0, "rewards/chosen": 1.4041528701782227, "rewards/margins": 0.6076897978782654, "rewards/rejected": 0.7964630722999573, "step": 3591 }, { "epoch": 1.94, "learning_rate": 5.495993930302414e-08, "logits/chosen": -2.1393885612487793, "logits/rejected": -2.1422555446624756, "logps/chosen": -1.0846341848373413, "logps/rejected": -3.827322483062744, "loss": 0.4632, "rewards/accuracies": 1.0, "rewards/chosen": 1.0056346654891968, "rewards/margins": 0.5289543271064758, "rewards/rejected": 0.47668033838272095, "step": 3592 }, { "epoch": 1.94, "learning_rate": 5.493820869595735e-08, "logits/chosen": -2.040517568588257, "logits/rejected": -2.0394675731658936, "logps/chosen": -6.846787452697754, "logps/rejected": -6.074560642242432, "loss": 0.4544, "rewards/accuracies": 1.0, "rewards/chosen": 1.0356444120407104, "rewards/margins": 0.5529789924621582, "rewards/rejected": 0.48266538977622986, "step": 3593 }, { "epoch": 1.94, "learning_rate": 5.491647714689455e-08, "logits/chosen": -2.1770732402801514, "logits/rejected": -2.129483222961426, "logps/chosen": -17.06546974182129, "logps/rejected": -3.380016326904297, "loss": 0.3065, "rewards/accuracies": 1.0, "rewards/chosen": 1.6228078603744507, "rewards/margins": 1.0253820419311523, "rewards/rejected": 0.5974258780479431, "step": 3594 }, { "epoch": 1.94, "learning_rate": 5.489474465998119e-08, "logits/chosen": -2.1930429935455322, "logits/rejected": -2.190972089767456, "logps/chosen": -2.165647506713867, "logps/rejected": -3.5731353759765625, "loss": 0.5124, "rewards/accuracies": 1.0, "rewards/chosen": 1.113433599472046, "rewards/margins": 0.4016042947769165, "rewards/rejected": 0.7118293046951294, "step": 3595 }, { "epoch": 1.94, "learning_rate": 5.48730112393629e-08, "logits/chosen": -2.0399200916290283, "logits/rejected": -2.055734872817993, "logps/chosen": -1.3917641639709473, "logps/rejected": -11.529861450195312, "loss": 0.5007, "rewards/accuracies": 1.0, "rewards/chosen": 1.2325493097305298, "rewards/margins": 0.4308490753173828, "rewards/rejected": 0.801700234413147, "step": 3596 }, { "epoch": 1.94, "learning_rate": 5.485127688918544e-08, "logits/chosen": -2.058516263961792, "logits/rejected": -2.274397373199463, "logps/chosen": -0.9720653891563416, "logps/rejected": -1.326521635055542, "loss": 0.7237, "rewards/accuracies": 0.0, "rewards/chosen": 0.8876609802246094, "rewards/margins": -0.060184597969055176, "rewards/rejected": 0.9478455781936646, "step": 3597 }, { "epoch": 1.94, "learning_rate": 5.48295416135948e-08, "logits/chosen": -2.2048678398132324, "logits/rejected": -2.2345263957977295, "logps/chosen": -0.7158111929893494, "logps/rejected": -9.557934761047363, "loss": 0.5193, "rewards/accuracies": 1.0, "rewards/chosen": 0.9336112141609192, "rewards/margins": 0.384507954120636, "rewards/rejected": 0.5491032600402832, "step": 3598 }, { "epoch": 1.94, "learning_rate": 5.480780541673713e-08, "logits/chosen": -1.974042296409607, "logits/rejected": -2.237959384918213, "logps/chosen": -0.4530504047870636, "logps/rejected": -0.45966383814811707, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.9543558359146118, "rewards/margins": 0.02943962812423706, "rewards/rejected": 0.9249162077903748, "step": 3599 }, { "epoch": 1.94, "learning_rate": 5.478606830275876e-08, "logits/chosen": -1.9876346588134766, "logits/rejected": -1.9918537139892578, "logps/chosen": -1.3774100542068481, "logps/rejected": -2.562788486480713, "loss": 0.568, "rewards/accuracies": 1.0, "rewards/chosen": 0.7948258519172668, "rewards/margins": 0.2682119607925415, "rewards/rejected": 0.5266138911247253, "step": 3600 }, { "epoch": 1.94, "learning_rate": 5.476433027580617e-08, "logits/chosen": -2.1266047954559326, "logits/rejected": -2.2594351768493652, "logps/chosen": -5.32330846786499, "logps/rejected": -6.4505510330200195, "loss": 0.5785, "rewards/accuracies": 1.0, "rewards/chosen": 1.0399259328842163, "rewards/margins": 0.24423545598983765, "rewards/rejected": 0.7956904768943787, "step": 3601 }, { "epoch": 1.94, "learning_rate": 5.4742591340026045e-08, "logits/chosen": -2.0285215377807617, "logits/rejected": -2.0351312160491943, "logps/chosen": -1.0173563957214355, "logps/rejected": -6.136363983154297, "loss": 0.4095, "rewards/accuracies": 1.0, "rewards/chosen": 0.9544100165367126, "rewards/margins": 0.6809588670730591, "rewards/rejected": 0.27345114946365356, "step": 3602 }, { "epoch": 1.94, "learning_rate": 5.4720851499565235e-08, "logits/chosen": -2.0603086948394775, "logits/rejected": -2.2381606101989746, "logps/chosen": -0.3537514805793762, "logps/rejected": -0.323258638381958, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.8082044720649719, "rewards/margins": 0.017936408519744873, "rewards/rejected": 0.790268063545227, "step": 3603 }, { "epoch": 1.94, "learning_rate": 5.469911075857072e-08, "logits/chosen": -2.121830701828003, "logits/rejected": -2.320558786392212, "logps/chosen": -1.5058709383010864, "logps/rejected": -1.515589952468872, "loss": 0.673, "rewards/accuracies": 1.0, "rewards/chosen": 0.9910416603088379, "rewards/margins": 0.040608227252960205, "rewards/rejected": 0.9504334330558777, "step": 3604 }, { "epoch": 1.94, "learning_rate": 5.467736912118975e-08, "logits/chosen": -2.0851352214813232, "logits/rejected": -2.0882434844970703, "logps/chosen": -1.3018954992294312, "logps/rejected": -10.926067352294922, "loss": 0.4413, "rewards/accuracies": 1.0, "rewards/chosen": 1.112728476524353, "rewards/margins": 0.5891490578651428, "rewards/rejected": 0.5235794186592102, "step": 3605 }, { "epoch": 1.94, "learning_rate": 5.4655626591569646e-08, "logits/chosen": -2.0296664237976074, "logits/rejected": -2.256809949874878, "logps/chosen": -1.4396100044250488, "logps/rejected": -1.5540106296539307, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.8887220621109009, "rewards/margins": 0.0038816332817077637, "rewards/rejected": 0.8848404288291931, "step": 3606 }, { "epoch": 1.95, "learning_rate": 5.463388317385796e-08, "logits/chosen": -2.04543137550354, "logits/rejected": -2.2388007640838623, "logps/chosen": -0.5588427782058716, "logps/rejected": -0.6702349185943604, "loss": 0.701, "rewards/accuracies": 0.0, "rewards/chosen": 0.8137646913528442, "rewards/margins": -0.015712738037109375, "rewards/rejected": 0.8294774293899536, "step": 3607 }, { "epoch": 1.95, "learning_rate": 5.461213887220238e-08, "logits/chosen": -2.024503469467163, "logits/rejected": -2.0160977840423584, "logps/chosen": -4.321872234344482, "logps/rejected": -5.3946452140808105, "loss": 0.404, "rewards/accuracies": 1.0, "rewards/chosen": 1.2791264057159424, "rewards/margins": 0.697435200214386, "rewards/rejected": 0.5816912055015564, "step": 3608 }, { "epoch": 1.95, "learning_rate": 5.459039369075079e-08, "logits/chosen": -2.0210506916046143, "logits/rejected": -2.025775909423828, "logps/chosen": -1.6851521730422974, "logps/rejected": -3.883415699005127, "loss": 0.4839, "rewards/accuracies": 1.0, "rewards/chosen": 0.974743664264679, "rewards/margins": 0.47407543659210205, "rewards/rejected": 0.5006682276725769, "step": 3609 }, { "epoch": 1.95, "learning_rate": 5.45686476336512e-08, "logits/chosen": -2.0383434295654297, "logits/rejected": -2.038661241531372, "logps/chosen": -0.5386857390403748, "logps/rejected": -4.759622573852539, "loss": 0.5137, "rewards/accuracies": 1.0, "rewards/chosen": 0.8971733450889587, "rewards/margins": 0.39833396673202515, "rewards/rejected": 0.4988393783569336, "step": 3610 }, { "epoch": 1.95, "learning_rate": 5.4546900705051835e-08, "logits/chosen": -2.021036386489868, "logits/rejected": -2.015972375869751, "logps/chosen": -5.994866847991943, "logps/rejected": -4.0190558433532715, "loss": 0.2933, "rewards/accuracies": 1.0, "rewards/chosen": 1.6536158323287964, "rewards/margins": 1.0762367248535156, "rewards/rejected": 0.5773791074752808, "step": 3611 }, { "epoch": 1.95, "learning_rate": 5.4525152909101056e-08, "logits/chosen": -2.089172840118408, "logits/rejected": -2.0873212814331055, "logps/chosen": -0.16937920451164246, "logps/rejected": -6.77223014831543, "loss": 0.494, "rewards/accuracies": 1.0, "rewards/chosen": 1.0405349731445312, "rewards/margins": 0.4480059742927551, "rewards/rejected": 0.5925289988517761, "step": 3612 }, { "epoch": 1.95, "learning_rate": 5.450340424994742e-08, "logits/chosen": -2.249952793121338, "logits/rejected": -2.1318063735961914, "logps/chosen": -34.18424606323242, "logps/rejected": -5.168241500854492, "loss": 0.1436, "rewards/accuracies": 1.0, "rewards/chosen": 2.600782871246338, "rewards/margins": 1.867957353591919, "rewards/rejected": 0.7328254580497742, "step": 3613 }, { "epoch": 1.95, "learning_rate": 5.44816547317396e-08, "logits/chosen": -2.210864782333374, "logits/rejected": -2.2124850749969482, "logps/chosen": -2.2966432571411133, "logps/rejected": -1.0906305313110352, "loss": 0.5881, "rewards/accuracies": 1.0, "rewards/chosen": 0.9825516939163208, "rewards/margins": 0.2224389910697937, "rewards/rejected": 0.7601127028465271, "step": 3614 }, { "epoch": 1.95, "learning_rate": 5.445990435862647e-08, "logits/chosen": -2.147650957107544, "logits/rejected": -2.1521430015563965, "logps/chosen": -7.498042106628418, "logps/rejected": -6.423391342163086, "loss": 0.6153, "rewards/accuracies": 1.0, "rewards/chosen": 1.32083261013031, "rewards/margins": 0.16216576099395752, "rewards/rejected": 1.1586668491363525, "step": 3615 }, { "epoch": 1.95, "learning_rate": 5.443815313475707e-08, "logits/chosen": -2.0820043087005615, "logits/rejected": -2.2852532863616943, "logps/chosen": -1.1551597118377686, "logps/rejected": -1.2207562923431396, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.7864789962768555, "rewards/margins": 0.005014240741729736, "rewards/rejected": 0.7814647555351257, "step": 3616 }, { "epoch": 1.95, "learning_rate": 5.4416401064280584e-08, "logits/chosen": -2.028430938720703, "logits/rejected": -2.025853157043457, "logps/chosen": -7.079080104827881, "logps/rejected": -5.270614147186279, "loss": 0.324, "rewards/accuracies": 1.0, "rewards/chosen": 1.3844565153121948, "rewards/margins": 0.9605071544647217, "rewards/rejected": 0.42394939064979553, "step": 3617 }, { "epoch": 1.95, "learning_rate": 5.4394648151346344e-08, "logits/chosen": -2.1327459812164307, "logits/rejected": -2.1437697410583496, "logps/chosen": -2.3994691371917725, "logps/rejected": -1.6701571941375732, "loss": 0.5416, "rewards/accuracies": 1.0, "rewards/chosen": 1.364667296409607, "rewards/margins": 0.33025693893432617, "rewards/rejected": 1.0344103574752808, "step": 3618 }, { "epoch": 1.95, "learning_rate": 5.43728944001039e-08, "logits/chosen": -1.9817605018615723, "logits/rejected": -1.9923640489578247, "logps/chosen": -3.0225892066955566, "logps/rejected": -4.54074764251709, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": 1.0853712558746338, "rewards/margins": 0.6512479186058044, "rewards/rejected": 0.43412333726882935, "step": 3619 }, { "epoch": 1.95, "learning_rate": 5.435113981470289e-08, "logits/chosen": -2.0950372219085693, "logits/rejected": -2.281940460205078, "logps/chosen": -0.8563705682754517, "logps/rejected": -0.8733157515525818, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 1.0230259895324707, "rewards/margins": 0.01388239860534668, "rewards/rejected": 1.009143590927124, "step": 3620 }, { "epoch": 1.95, "learning_rate": 5.432938439929313e-08, "logits/chosen": -2.1461102962493896, "logits/rejected": -2.128586530685425, "logps/chosen": -7.180172920227051, "logps/rejected": -4.746945381164551, "loss": 0.397, "rewards/accuracies": 1.0, "rewards/chosen": 1.3265166282653809, "rewards/margins": 0.7187593579292297, "rewards/rejected": 0.6077572703361511, "step": 3621 }, { "epoch": 1.95, "learning_rate": 5.430762815802466e-08, "logits/chosen": -2.035813093185425, "logits/rejected": -2.0389866828918457, "logps/chosen": -3.300661087036133, "logps/rejected": -4.2526397705078125, "loss": 0.4928, "rewards/accuracies": 1.0, "rewards/chosen": 1.0393290519714355, "rewards/margins": 0.451177179813385, "rewards/rejected": 0.5881518721580505, "step": 3622 }, { "epoch": 1.95, "learning_rate": 5.428587109504762e-08, "logits/chosen": -2.0476090908050537, "logits/rejected": -2.222781181335449, "logps/chosen": -0.5960241556167603, "logps/rejected": -0.596552312374115, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.8430366516113281, "rewards/margins": 0.02906513214111328, "rewards/rejected": 0.8139715194702148, "step": 3623 }, { "epoch": 1.95, "learning_rate": 5.426411321451229e-08, "logits/chosen": -2.303165912628174, "logits/rejected": -2.150994300842285, "logps/chosen": -29.1363525390625, "logps/rejected": -3.8306469917297363, "loss": 0.1871, "rewards/accuracies": 1.0, "rewards/chosen": 1.9777153730392456, "rewards/margins": 1.5809051990509033, "rewards/rejected": 0.3968102037906647, "step": 3624 }, { "epoch": 1.96, "learning_rate": 5.4242354520569135e-08, "logits/chosen": -2.064094066619873, "logits/rejected": -2.324246883392334, "logps/chosen": -0.2825002670288086, "logps/rejected": -0.768959105014801, "loss": 0.657, "rewards/accuracies": 1.0, "rewards/chosen": 0.8257061243057251, "rewards/margins": 0.07373374700546265, "rewards/rejected": 0.7519723773002625, "step": 3625 }, { "epoch": 1.96, "learning_rate": 5.4220595017368754e-08, "logits/chosen": -2.05194354057312, "logits/rejected": -2.064537763595581, "logps/chosen": -1.4101152420043945, "logps/rejected": -7.270249843597412, "loss": 0.4288, "rewards/accuracies": 1.0, "rewards/chosen": 1.274774432182312, "rewards/margins": 0.6245919466018677, "rewards/rejected": 0.6501824855804443, "step": 3626 }, { "epoch": 1.96, "learning_rate": 5.419883470906195e-08, "logits/chosen": -2.106579065322876, "logits/rejected": -2.0743794441223145, "logps/chosen": -16.929821014404297, "logps/rejected": -3.3825156688690186, "loss": 0.3722, "rewards/accuracies": 1.0, "rewards/chosen": 1.4531835317611694, "rewards/margins": 0.7966189980506897, "rewards/rejected": 0.6565645337104797, "step": 3627 }, { "epoch": 1.96, "learning_rate": 5.417707359979963e-08, "logits/chosen": -2.0745115280151367, "logits/rejected": -2.328366279602051, "logps/chosen": -0.3147038519382477, "logps/rejected": -0.3282829523086548, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 1.0757439136505127, "rewards/margins": -0.005002498626708984, "rewards/rejected": 1.0807464122772217, "step": 3628 }, { "epoch": 1.96, "learning_rate": 5.415531169373286e-08, "logits/chosen": -2.365603446960449, "logits/rejected": -2.310199737548828, "logps/chosen": -20.547658920288086, "logps/rejected": -5.910229682922363, "loss": 0.1298, "rewards/accuracies": 1.0, "rewards/chosen": 2.363125801086426, "rewards/margins": 1.976118803024292, "rewards/rejected": 0.3870069682598114, "step": 3629 }, { "epoch": 1.96, "learning_rate": 5.413354899501289e-08, "logits/chosen": -1.9602290391921997, "logits/rejected": -1.9631354808807373, "logps/chosen": -2.491878032684326, "logps/rejected": -0.6229082345962524, "loss": 0.6256, "rewards/accuracies": 1.0, "rewards/chosen": 1.1222670078277588, "rewards/margins": 0.14002680778503418, "rewards/rejected": 0.9822402000427246, "step": 3630 }, { "epoch": 1.96, "learning_rate": 5.4111785507791086e-08, "logits/chosen": -2.091599941253662, "logits/rejected": -2.0712924003601074, "logps/chosen": -8.047863006591797, "logps/rejected": -1.6304322481155396, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 1.6876106262207031, "rewards/margins": 0.8446629643440247, "rewards/rejected": 0.8429476618766785, "step": 3631 }, { "epoch": 1.96, "learning_rate": 5.4090021236218973e-08, "logits/chosen": -1.929545521736145, "logits/rejected": -2.2699785232543945, "logps/chosen": -0.3640846014022827, "logps/rejected": -0.3095863461494446, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 0.9509373903274536, "rewards/margins": -0.0021939873695373535, "rewards/rejected": 0.953131377696991, "step": 3632 }, { "epoch": 1.96, "learning_rate": 5.4068256184448234e-08, "logits/chosen": -2.0914688110351562, "logits/rejected": -2.3215935230255127, "logps/chosen": -0.2989498972892761, "logps/rejected": -0.2731834053993225, "loss": 0.6724, "rewards/accuracies": 1.0, "rewards/chosen": 0.879585862159729, "rewards/margins": 0.041986286640167236, "rewards/rejected": 0.8375995755195618, "step": 3633 }, { "epoch": 1.96, "learning_rate": 5.4046490356630706e-08, "logits/chosen": -2.2288906574249268, "logits/rejected": -2.103566884994507, "logps/chosen": -21.369752883911133, "logps/rejected": -11.54580307006836, "loss": 0.2397, "rewards/accuracies": 1.0, "rewards/chosen": 1.8036367893218994, "rewards/margins": 1.3059242963790894, "rewards/rejected": 0.49771252274513245, "step": 3634 }, { "epoch": 1.96, "learning_rate": 5.4024723756918346e-08, "logits/chosen": -2.0715019702911377, "logits/rejected": -2.332252264022827, "logps/chosen": -0.4024304747581482, "logps/rejected": -0.3560556173324585, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.8369898200035095, "rewards/margins": 0.015328168869018555, "rewards/rejected": 0.821661651134491, "step": 3635 }, { "epoch": 1.96, "learning_rate": 5.40029563894633e-08, "logits/chosen": -2.047163248062134, "logits/rejected": -2.307330369949341, "logps/chosen": -0.45984938740730286, "logps/rejected": -0.5463608503341675, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.9504408836364746, "rewards/margins": 0.0015348196029663086, "rewards/rejected": 0.9489060640335083, "step": 3636 }, { "epoch": 1.96, "learning_rate": 5.398118825841781e-08, "logits/chosen": -2.2468724250793457, "logits/rejected": -2.2450578212738037, "logps/chosen": -3.1792004108428955, "logps/rejected": -5.395786285400391, "loss": 0.4631, "rewards/accuracies": 1.0, "rewards/chosen": 0.9051036238670349, "rewards/margins": 0.5293765068054199, "rewards/rejected": 0.3757270872592926, "step": 3637 }, { "epoch": 1.96, "learning_rate": 5.395941936793431e-08, "logits/chosen": -2.0650806427001953, "logits/rejected": -2.2610535621643066, "logps/chosen": -0.8563545942306519, "logps/rejected": -0.9188370704650879, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9354700446128845, "rewards/margins": 0.012468338012695312, "rewards/rejected": 0.9230017066001892, "step": 3638 }, { "epoch": 1.96, "learning_rate": 5.3937649722165356e-08, "logits/chosen": -2.0952658653259277, "logits/rejected": -2.1226134300231934, "logps/chosen": -17.809873580932617, "logps/rejected": -14.871808052062988, "loss": 0.1159, "rewards/accuracies": 1.0, "rewards/chosen": 2.4176249504089355, "rewards/margins": 2.0962796211242676, "rewards/rejected": 0.32134541869163513, "step": 3639 }, { "epoch": 1.96, "learning_rate": 5.3915879325263645e-08, "logits/chosen": -2.012047529220581, "logits/rejected": -2.006234645843506, "logps/chosen": -3.3189969062805176, "logps/rejected": -3.613373041152954, "loss": 0.3451, "rewards/accuracies": 1.0, "rewards/chosen": 1.4805885553359985, "rewards/margins": 0.8864027857780457, "rewards/rejected": 0.5941857695579529, "step": 3640 }, { "epoch": 1.96, "learning_rate": 5.389410818138205e-08, "logits/chosen": -2.012483835220337, "logits/rejected": -2.0047624111175537, "logps/chosen": -5.077771186828613, "logps/rejected": -0.8991668820381165, "loss": 0.4986, "rewards/accuracies": 1.0, "rewards/chosen": 1.2173106670379639, "rewards/margins": 0.43637871742248535, "rewards/rejected": 0.7809319496154785, "step": 3641 }, { "epoch": 1.96, "learning_rate": 5.3872336294673527e-08, "logits/chosen": -2.155580997467041, "logits/rejected": -2.2925727367401123, "logps/chosen": -1.5617129802703857, "logps/rejected": -1.1076626777648926, "loss": 0.6475, "rewards/accuracies": 1.0, "rewards/chosen": 0.9594538807868958, "rewards/margins": 0.0934516191482544, "rewards/rejected": 0.8660022616386414, "step": 3642 }, { "epoch": 1.96, "learning_rate": 5.38505636692912e-08, "logits/chosen": -2.254804849624634, "logits/rejected": -2.342364549636841, "logps/chosen": -1.0463991165161133, "logps/rejected": -1.1011871099472046, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 1.0505067110061646, "rewards/margins": -0.006843924522399902, "rewards/rejected": 1.0573506355285645, "step": 3643 }, { "epoch": 1.97, "learning_rate": 5.38287903093884e-08, "logits/chosen": -2.183079481124878, "logits/rejected": -2.3198461532592773, "logps/chosen": -3.7516674995422363, "logps/rejected": -10.605661392211914, "loss": 0.5127, "rewards/accuracies": 1.0, "rewards/chosen": 1.0052766799926758, "rewards/margins": 0.400674045085907, "rewards/rejected": 0.6046026349067688, "step": 3644 }, { "epoch": 1.97, "learning_rate": 5.380701621911848e-08, "logits/chosen": -2.001370668411255, "logits/rejected": -2.000755786895752, "logps/chosen": -0.7582508325576782, "logps/rejected": -2.9652676582336426, "loss": 0.5194, "rewards/accuracies": 1.0, "rewards/chosen": 1.1500357389450073, "rewards/margins": 0.38422298431396484, "rewards/rejected": 0.7658127546310425, "step": 3645 }, { "epoch": 1.97, "learning_rate": 5.378524140263503e-08, "logits/chosen": -2.0780045986175537, "logits/rejected": -2.269456148147583, "logps/chosen": -0.9369169473648071, "logps/rejected": -1.3529942035675049, "loss": 0.732, "rewards/accuracies": 0.0, "rewards/chosen": 0.870364785194397, "rewards/margins": -0.07632702589035034, "rewards/rejected": 0.9466918110847473, "step": 3646 }, { "epoch": 1.97, "learning_rate": 5.376346586409173e-08, "logits/chosen": -2.0415072441101074, "logits/rejected": -2.035616874694824, "logps/chosen": -3.600480079650879, "logps/rejected": -1.3882436752319336, "loss": 0.6464, "rewards/accuracies": 1.0, "rewards/chosen": 0.8533377647399902, "rewards/margins": 0.09587603807449341, "rewards/rejected": 0.7574617266654968, "step": 3647 }, { "epoch": 1.97, "learning_rate": 5.3741689607642394e-08, "logits/chosen": -2.132931709289551, "logits/rejected": -2.114835262298584, "logps/chosen": -1.386531114578247, "logps/rejected": -9.017032623291016, "loss": 0.4244, "rewards/accuracies": 1.0, "rewards/chosen": 1.2309025526046753, "rewards/margins": 0.63731849193573, "rewards/rejected": 0.5935840606689453, "step": 3648 }, { "epoch": 1.97, "learning_rate": 5.371991263744099e-08, "logits/chosen": -2.209303617477417, "logits/rejected": -2.0721001625061035, "logps/chosen": -42.26111602783203, "logps/rejected": -3.3322577476501465, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 2.842604875564575, "rewards/margins": 2.1559088230133057, "rewards/rejected": 0.6866961121559143, "step": 3649 }, { "epoch": 1.97, "learning_rate": 5.3698134957641625e-08, "logits/chosen": -1.984540343284607, "logits/rejected": -1.9844027757644653, "logps/chosen": -1.631079912185669, "logps/rejected": -2.8475661277770996, "loss": 0.5869, "rewards/accuracies": 1.0, "rewards/chosen": 1.115670084953308, "rewards/margins": 0.22509831190109253, "rewards/rejected": 0.8905717730522156, "step": 3650 }, { "epoch": 1.97, "learning_rate": 5.367635657239855e-08, "logits/chosen": -2.019505023956299, "logits/rejected": -2.0277209281921387, "logps/chosen": -8.019850730895996, "logps/rejected": -1.5539907217025757, "loss": 0.6164, "rewards/accuracies": 1.0, "rewards/chosen": 1.2412816286087036, "rewards/margins": 0.1598522663116455, "rewards/rejected": 1.081429362297058, "step": 3651 }, { "epoch": 1.97, "learning_rate": 5.365457748586611e-08, "logits/chosen": -1.986094355583191, "logits/rejected": -1.9883394241333008, "logps/chosen": -1.8890990018844604, "logps/rejected": -3.778244972229004, "loss": 0.4877, "rewards/accuracies": 1.0, "rewards/chosen": 0.9593254327774048, "rewards/margins": 0.46430590748786926, "rewards/rejected": 0.4950195252895355, "step": 3652 }, { "epoch": 1.97, "learning_rate": 5.363279770219884e-08, "logits/chosen": -2.060912609100342, "logits/rejected": -2.0560131072998047, "logps/chosen": -3.004176616668701, "logps/rejected": -6.561220645904541, "loss": 0.2288, "rewards/accuracies": 1.0, "rewards/chosen": 1.556674838066101, "rewards/margins": 1.3581597805023193, "rewards/rejected": 0.19851508736610413, "step": 3653 }, { "epoch": 1.97, "learning_rate": 5.3611017225551346e-08, "logits/chosen": -2.1591947078704834, "logits/rejected": -2.150681734085083, "logps/chosen": -5.085165023803711, "logps/rejected": -4.760881423950195, "loss": 0.6274, "rewards/accuracies": 1.0, "rewards/chosen": 0.881476104259491, "rewards/margins": 0.1361275315284729, "rewards/rejected": 0.7453485727310181, "step": 3654 }, { "epoch": 1.97, "learning_rate": 5.358923606007841e-08, "logits/chosen": -2.1130785942077637, "logits/rejected": -2.354384422302246, "logps/chosen": -2.027676582336426, "logps/rejected": -2.239769697189331, "loss": 0.6732, "rewards/accuracies": 1.0, "rewards/chosen": 0.693688690662384, "rewards/margins": 0.040288448333740234, "rewards/rejected": 0.6534002423286438, "step": 3655 }, { "epoch": 1.97, "learning_rate": 5.356745420993495e-08, "logits/chosen": -2.061162233352661, "logits/rejected": -2.285522222518921, "logps/chosen": -4.5440144538879395, "logps/rejected": -0.6678183078765869, "loss": 0.8142, "rewards/accuracies": 0.0, "rewards/chosen": 0.479480117559433, "rewards/margins": -0.22894558310508728, "rewards/rejected": 0.7084257006645203, "step": 3656 }, { "epoch": 1.97, "learning_rate": 5.3545671679275974e-08, "logits/chosen": -2.1494369506835938, "logits/rejected": -2.3313562870025635, "logps/chosen": -7.101583480834961, "logps/rejected": -10.399408340454102, "loss": 0.7111, "rewards/accuracies": 0.0, "rewards/chosen": 0.9581832885742188, "rewards/margins": -0.035661160945892334, "rewards/rejected": 0.9938444495201111, "step": 3657 }, { "epoch": 1.97, "learning_rate": 5.352388847225665e-08, "logits/chosen": -2.1338164806365967, "logits/rejected": -2.1358890533447266, "logps/chosen": -0.606826663017273, "logps/rejected": -4.773114204406738, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 1.0905581712722778, "rewards/margins": 0.6347821950912476, "rewards/rejected": 0.4557759463787079, "step": 3658 }, { "epoch": 1.97, "learning_rate": 5.350210459303228e-08, "logits/chosen": -2.126983880996704, "logits/rejected": -2.2727506160736084, "logps/chosen": -2.199129819869995, "logps/rejected": -2.0139293670654297, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.9575549364089966, "rewards/margins": 0.029628098011016846, "rewards/rejected": 0.9279268383979797, "step": 3659 }, { "epoch": 1.97, "learning_rate": 5.3480320045758264e-08, "logits/chosen": -2.128725290298462, "logits/rejected": -2.1285831928253174, "logps/chosen": -0.8863207697868347, "logps/rejected": -1.3091002702713013, "loss": 0.5584, "rewards/accuracies": 1.0, "rewards/chosen": 1.0624957084655762, "rewards/margins": 0.2904127836227417, "rewards/rejected": 0.7720829248428345, "step": 3660 }, { "epoch": 1.97, "learning_rate": 5.345853483459017e-08, "logits/chosen": -2.0849356651306152, "logits/rejected": -2.069603443145752, "logps/chosen": -12.202630996704102, "logps/rejected": -4.362443447113037, "loss": 0.3091, "rewards/accuracies": 1.0, "rewards/chosen": 1.5074281692504883, "rewards/margins": 1.0157320499420166, "rewards/rejected": 0.4916961193084717, "step": 3661 }, { "epoch": 1.98, "learning_rate": 5.3436748963683674e-08, "logits/chosen": -1.9857255220413208, "logits/rejected": -1.9858840703964233, "logps/chosen": -0.5648418664932251, "logps/rejected": -2.17474365234375, "loss": 0.6179, "rewards/accuracies": 1.0, "rewards/chosen": 0.9982921481132507, "rewards/margins": 0.15665459632873535, "rewards/rejected": 0.8416375517845154, "step": 3662 }, { "epoch": 1.98, "learning_rate": 5.3414962437194554e-08, "logits/chosen": -2.1258795261383057, "logits/rejected": -2.118934154510498, "logps/chosen": -4.5201520919799805, "logps/rejected": -6.09279727935791, "loss": 0.327, "rewards/accuracies": 1.0, "rewards/chosen": 1.5171315670013428, "rewards/margins": 0.9497351050376892, "rewards/rejected": 0.5673964619636536, "step": 3663 }, { "epoch": 1.98, "learning_rate": 5.3393175259278744e-08, "logits/chosen": -2.0359766483306885, "logits/rejected": -2.0350961685180664, "logps/chosen": -0.28694260120391846, "logps/rejected": -5.628061294555664, "loss": 0.4466, "rewards/accuracies": 1.0, "rewards/chosen": 0.9596778154373169, "rewards/margins": 0.5744402408599854, "rewards/rejected": 0.38523760437965393, "step": 3664 }, { "epoch": 1.98, "learning_rate": 5.3371387434092285e-08, "logits/chosen": -2.032904624938965, "logits/rejected": -2.033900260925293, "logps/chosen": -2.754124879837036, "logps/rejected": -1.114784598350525, "loss": 0.467, "rewards/accuracies": 1.0, "rewards/chosen": 1.4697120189666748, "rewards/margins": 0.5189302563667297, "rewards/rejected": 0.9507817625999451, "step": 3665 }, { "epoch": 1.98, "learning_rate": 5.3349598965791356e-08, "logits/chosen": -2.097417116165161, "logits/rejected": -2.102377414703369, "logps/chosen": -1.8224313259124756, "logps/rejected": -12.855051040649414, "loss": 0.5136, "rewards/accuracies": 1.0, "rewards/chosen": 1.358121395111084, "rewards/margins": 0.39853984117507935, "rewards/rejected": 0.9595815539360046, "step": 3666 }, { "epoch": 1.98, "learning_rate": 5.332780985853225e-08, "logits/chosen": -2.0740532875061035, "logits/rejected": -2.325929641723633, "logps/chosen": -0.3522924780845642, "logps/rejected": -0.34078091382980347, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.920504093170166, "rewards/margins": 0.03232991695404053, "rewards/rejected": 0.8881741762161255, "step": 3667 }, { "epoch": 1.98, "learning_rate": 5.330602011647139e-08, "logits/chosen": -2.130035638809204, "logits/rejected": -2.264735698699951, "logps/chosen": -3.577134370803833, "logps/rejected": -3.5676515102386475, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.7856870889663696, "rewards/margins": -0.0033836960792541504, "rewards/rejected": 0.7890707850456238, "step": 3668 }, { "epoch": 1.98, "learning_rate": 5.328422974376531e-08, "logits/chosen": -2.0593528747558594, "logits/rejected": -2.306666612625122, "logps/chosen": -0.31877830624580383, "logps/rejected": -0.3613259792327881, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9315038919448853, "rewards/margins": 0.021606087684631348, "rewards/rejected": 0.9098978042602539, "step": 3669 }, { "epoch": 1.98, "learning_rate": 5.326243874457067e-08, "logits/chosen": -2.0122292041778564, "logits/rejected": -2.245861768722534, "logps/chosen": -0.6944581866264343, "logps/rejected": -0.8255863189697266, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.874725341796875, "rewards/margins": 0.0036786794662475586, "rewards/rejected": 0.8710466623306274, "step": 3670 }, { "epoch": 1.98, "learning_rate": 5.3240647123044225e-08, "logits/chosen": -1.9463447332382202, "logits/rejected": -1.9455844163894653, "logps/chosen": -0.7617964148521423, "logps/rejected": -1.4353405237197876, "loss": 0.6142, "rewards/accuracies": 1.0, "rewards/chosen": 0.955678403377533, "rewards/margins": 0.16469818353652954, "rewards/rejected": 0.7909802198410034, "step": 3671 }, { "epoch": 1.98, "learning_rate": 5.32188548833429e-08, "logits/chosen": -2.1820271015167236, "logits/rejected": -2.301361560821533, "logps/chosen": -7.290087699890137, "logps/rejected": -7.171544075012207, "loss": 0.6986, "rewards/accuracies": 0.0, "rewards/chosen": 0.47340163588523865, "rewards/margins": -0.010893821716308594, "rewards/rejected": 0.48429545760154724, "step": 3672 }, { "epoch": 1.98, "learning_rate": 5.3197062029623697e-08, "logits/chosen": -2.1647448539733887, "logits/rejected": -2.3088388442993164, "logps/chosen": -1.989006757736206, "logps/rejected": -2.221431016921997, "loss": 0.673, "rewards/accuracies": 1.0, "rewards/chosen": 0.7472615242004395, "rewards/margins": 0.040667831897735596, "rewards/rejected": 0.7065936923027039, "step": 3673 }, { "epoch": 1.98, "learning_rate": 5.3175268566043754e-08, "logits/chosen": -2.0738611221313477, "logits/rejected": -1.9700708389282227, "logps/chosen": -24.326677322387695, "logps/rejected": -8.446791648864746, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 2.1550936698913574, "rewards/margins": 1.8798704147338867, "rewards/rejected": 0.2752232551574707, "step": 3674 }, { "epoch": 1.98, "learning_rate": 5.315347449676031e-08, "logits/chosen": -2.0009877681732178, "logits/rejected": -2.270909070968628, "logps/chosen": -0.9469007849693298, "logps/rejected": -0.8482908606529236, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 1.0211901664733887, "rewards/margins": 0.013028264045715332, "rewards/rejected": 1.0081619024276733, "step": 3675 }, { "epoch": 1.98, "learning_rate": 5.313167982593072e-08, "logits/chosen": -2.2092602252960205, "logits/rejected": -2.1428136825561523, "logps/chosen": -22.589111328125, "logps/rejected": -1.9932399988174438, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": 2.168983221054077, "rewards/margins": 1.5422471761703491, "rewards/rejected": 0.626736044883728, "step": 3676 }, { "epoch": 1.98, "learning_rate": 5.310988455771248e-08, "logits/chosen": -2.0401499271392822, "logits/rejected": -2.2463037967681885, "logps/chosen": -0.5009034276008606, "logps/rejected": -0.49998220801353455, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.8066224455833435, "rewards/margins": 0.028779268264770508, "rewards/rejected": 0.777843177318573, "step": 3677 }, { "epoch": 1.98, "learning_rate": 5.308808869626318e-08, "logits/chosen": -2.217207193374634, "logits/rejected": -2.213775634765625, "logps/chosen": -6.274115085601807, "logps/rejected": -4.1011223793029785, "loss": 0.4721, "rewards/accuracies": 1.0, "rewards/chosen": 0.9440318942070007, "rewards/margins": 0.5052103400230408, "rewards/rejected": 0.43882155418395996, "step": 3678 }, { "epoch": 1.98, "learning_rate": 5.306629224574052e-08, "logits/chosen": -1.9273943901062012, "logits/rejected": -2.2659246921539307, "logps/chosen": -0.17560744285583496, "logps/rejected": -0.1771697700023651, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.8157283067703247, "rewards/margins": 0.005568444728851318, "rewards/rejected": 0.8101598620414734, "step": 3679 }, { "epoch": 1.98, "learning_rate": 5.304449521030231e-08, "logits/chosen": -2.0759880542755127, "logits/rejected": -2.081372022628784, "logps/chosen": -0.6314162015914917, "logps/rejected": -3.644310235977173, "loss": 0.4682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9524656534194946, "rewards/margins": 0.5156991481781006, "rewards/rejected": 0.43676653504371643, "step": 3680 }, { "epoch": 1.99, "learning_rate": 5.302269759410648e-08, "logits/chosen": -2.0584309101104736, "logits/rejected": -2.0666146278381348, "logps/chosen": -0.4209555685520172, "logps/rejected": -8.0753755569458, "loss": 0.375, "rewards/accuracies": 1.0, "rewards/chosen": 1.0262422561645508, "rewards/margins": 0.7875322103500366, "rewards/rejected": 0.23871003091335297, "step": 3681 }, { "epoch": 1.99, "learning_rate": 5.3000899401311085e-08, "logits/chosen": -2.0474326610565186, "logits/rejected": -2.252455234527588, "logps/chosen": -0.40301400423049927, "logps/rejected": -0.404124915599823, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 1.011457920074463, "rewards/margins": -0.003296494483947754, "rewards/rejected": 1.0147544145584106, "step": 3682 }, { "epoch": 1.99, "learning_rate": 5.297910063607425e-08, "logits/chosen": -2.0671653747558594, "logits/rejected": -2.0680110454559326, "logps/chosen": -1.3657653331756592, "logps/rejected": -2.4667537212371826, "loss": 0.544, "rewards/accuracies": 1.0, "rewards/chosen": 1.21430242061615, "rewards/margins": 0.32455146312713623, "rewards/rejected": 0.8897509574890137, "step": 3683 }, { "epoch": 1.99, "learning_rate": 5.2957301302554255e-08, "logits/chosen": -1.9910926818847656, "logits/rejected": -2.0169475078582764, "logps/chosen": -6.931237697601318, "logps/rejected": -24.03497314453125, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 1.2244608402252197, "rewards/margins": 0.018871545791625977, "rewards/rejected": 1.2055892944335938, "step": 3684 }, { "epoch": 1.99, "learning_rate": 5.293550140490948e-08, "logits/chosen": -2.124105930328369, "logits/rejected": -2.2398622035980225, "logps/chosen": -0.6558026671409607, "logps/rejected": -0.7032850384712219, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 1.023706316947937, "rewards/margins": -0.0031124353408813477, "rewards/rejected": 1.0268187522888184, "step": 3685 }, { "epoch": 1.99, "learning_rate": 5.291370094729836e-08, "logits/chosen": -2.0611305236816406, "logits/rejected": -2.0621912479400635, "logps/chosen": -0.186442568898201, "logps/rejected": -5.585810661315918, "loss": 0.4101, "rewards/accuracies": 1.0, "rewards/chosen": 0.996258556842804, "rewards/margins": 0.6792681813240051, "rewards/rejected": 0.31699037551879883, "step": 3686 }, { "epoch": 1.99, "learning_rate": 5.289189993387949e-08, "logits/chosen": -2.167921543121338, "logits/rejected": -2.1646783351898193, "logps/chosen": -3.1292126178741455, "logps/rejected": -3.9600250720977783, "loss": 0.3294, "rewards/accuracies": 1.0, "rewards/chosen": 1.5820552110671997, "rewards/margins": 0.9412152171134949, "rewards/rejected": 0.6408399939537048, "step": 3687 }, { "epoch": 1.99, "learning_rate": 5.2870098368811566e-08, "logits/chosen": -2.031771421432495, "logits/rejected": -2.0156500339508057, "logps/chosen": -7.112964153289795, "logps/rejected": -4.994486331939697, "loss": 0.3447, "rewards/accuracies": 1.0, "rewards/chosen": 1.35771644115448, "rewards/margins": 0.887855052947998, "rewards/rejected": 0.46986135840415955, "step": 3688 }, { "epoch": 1.99, "learning_rate": 5.2848296256253387e-08, "logits/chosen": -2.212639570236206, "logits/rejected": -2.2033143043518066, "logps/chosen": -7.044456481933594, "logps/rejected": -3.789233446121216, "loss": 0.4467, "rewards/accuracies": 1.0, "rewards/chosen": 1.3067922592163086, "rewards/margins": 0.5743463635444641, "rewards/rejected": 0.7324458956718445, "step": 3689 }, { "epoch": 1.99, "learning_rate": 5.2826493600363845e-08, "logits/chosen": -2.206329345703125, "logits/rejected": -2.313864231109619, "logps/chosen": -0.8707219362258911, "logps/rejected": -0.9552516341209412, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9589501619338989, "rewards/margins": 0.017126083374023438, "rewards/rejected": 0.9418240785598755, "step": 3690 }, { "epoch": 1.99, "learning_rate": 5.280469040530193e-08, "logits/chosen": -2.156890392303467, "logits/rejected": -2.1510872840881348, "logps/chosen": -6.933272838592529, "logps/rejected": -5.969849586486816, "loss": 0.3702, "rewards/accuracies": 1.0, "rewards/chosen": 1.1019439697265625, "rewards/margins": 0.8028780817985535, "rewards/rejected": 0.29906588792800903, "step": 3691 }, { "epoch": 1.99, "learning_rate": 5.2782886675226767e-08, "logits/chosen": -1.9721888303756714, "logits/rejected": -1.9714094400405884, "logps/chosen": -0.5881879925727844, "logps/rejected": -2.665724039077759, "loss": 0.5738, "rewards/accuracies": 1.0, "rewards/chosen": 0.9428672194480896, "rewards/margins": 0.2547915577888489, "rewards/rejected": 0.6880756616592407, "step": 3692 }, { "epoch": 1.99, "learning_rate": 5.276108241429753e-08, "logits/chosen": -2.054983139038086, "logits/rejected": -2.2806780338287354, "logps/chosen": -1.3219339847564697, "logps/rejected": -1.885657787322998, "loss": 0.7035, "rewards/accuracies": 0.0, "rewards/chosen": 0.882868230342865, "rewards/margins": -0.020513534545898438, "rewards/rejected": 0.9033817648887634, "step": 3693 }, { "epoch": 1.99, "learning_rate": 5.273927762667356e-08, "logits/chosen": -2.0084400177001953, "logits/rejected": -2.00211763381958, "logps/chosen": -3.100466251373291, "logps/rejected": -6.083026885986328, "loss": 0.4293, "rewards/accuracies": 1.0, "rewards/chosen": 0.8899837732315063, "rewards/margins": 0.6232304573059082, "rewards/rejected": 0.26675328612327576, "step": 3694 }, { "epoch": 1.99, "learning_rate": 5.271747231651424e-08, "logits/chosen": -2.173266649246216, "logits/rejected": -2.1871767044067383, "logps/chosen": -10.451748847961426, "logps/rejected": -11.024694442749023, "loss": 0.26, "rewards/accuracies": 1.0, "rewards/chosen": 1.9291950464248657, "rewards/margins": 1.2140613794326782, "rewards/rejected": 0.7151336669921875, "step": 3695 }, { "epoch": 1.99, "learning_rate": 5.269566648797909e-08, "logits/chosen": -2.158201217651367, "logits/rejected": -2.1554794311523438, "logps/chosen": -3.6507716178894043, "logps/rejected": -3.881014347076416, "loss": 0.5486, "rewards/accuracies": 1.0, "rewards/chosen": 0.793388307094574, "rewards/margins": 0.31348302960395813, "rewards/rejected": 0.47990527749061584, "step": 3696 }, { "epoch": 1.99, "learning_rate": 5.2673860145227724e-08, "logits/chosen": -1.9874022006988525, "logits/rejected": -2.3363523483276367, "logps/chosen": -15.369453430175781, "logps/rejected": -0.5159489512443542, "loss": 0.5329, "rewards/accuracies": 1.0, "rewards/chosen": 1.1753151416778564, "rewards/margins": 0.35113030672073364, "rewards/rejected": 0.8241848349571228, "step": 3697 }, { "epoch": 1.99, "learning_rate": 5.265205329241983e-08, "logits/chosen": -2.04880690574646, "logits/rejected": -2.2198286056518555, "logps/chosen": -0.2591947615146637, "logps/rejected": -0.27062785625457764, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.899120032787323, "rewards/margins": 0.023807108402252197, "rewards/rejected": 0.8753129243850708, "step": 3698 }, { "epoch": 2.0, "learning_rate": 5.26302459337152e-08, "logits/chosen": -2.000110149383545, "logits/rejected": -2.0003087520599365, "logps/chosen": -0.3856610059738159, "logps/rejected": -1.9355429410934448, "loss": 0.5631, "rewards/accuracies": 1.0, "rewards/chosen": 0.8895171284675598, "rewards/margins": 0.2796570658683777, "rewards/rejected": 0.6098600625991821, "step": 3699 }, { "epoch": 2.0, "learning_rate": 5.260843807327377e-08, "logits/chosen": -2.0917446613311768, "logits/rejected": -2.0900332927703857, "logps/chosen": -4.989665985107422, "logps/rejected": -3.8993163108825684, "loss": 0.2541, "rewards/accuracies": 1.0, "rewards/chosen": 1.7025024890899658, "rewards/margins": 1.2403101921081543, "rewards/rejected": 0.4621922969818115, "step": 3700 }, { "epoch": 2.0, "learning_rate": 5.2586629715255496e-08, "logits/chosen": -1.977220892906189, "logits/rejected": -1.9775199890136719, "logps/chosen": -0.1196359172463417, "logps/rejected": -8.007227897644043, "loss": 0.4777, "rewards/accuracies": 1.0, "rewards/chosen": 0.8298857808113098, "rewards/margins": 0.49054786562919617, "rewards/rejected": 0.33933791518211365, "step": 3701 }, { "epoch": 2.0, "learning_rate": 5.256482086382048e-08, "logits/chosen": -2.140373945236206, "logits/rejected": -2.3086020946502686, "logps/chosen": -5.898342132568359, "logps/rejected": -5.675469875335693, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.5865230560302734, "rewards/margins": -0.008078157901763916, "rewards/rejected": 0.5946012139320374, "step": 3702 }, { "epoch": 2.0, "learning_rate": 5.254301152312891e-08, "logits/chosen": -2.078094005584717, "logits/rejected": -2.2618942260742188, "logps/chosen": -9.533387184143066, "logps/rejected": -5.3098464012146, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 1.0841940641403198, "rewards/margins": 0.02349853515625, "rewards/rejected": 1.0606955289840698, "step": 3703 }, { "epoch": 2.0, "learning_rate": 5.2521201697341055e-08, "logits/chosen": -2.1033926010131836, "logits/rejected": -2.1081578731536865, "logps/chosen": -2.0369648933410645, "logps/rejected": -3.6938657760620117, "loss": 0.4585, "rewards/accuracies": 1.0, "rewards/chosen": 1.0432718992233276, "rewards/margins": 0.5417638421058655, "rewards/rejected": 0.5015080571174622, "step": 3704 }, { "epoch": 2.0, "learning_rate": 5.2499391390617285e-08, "logits/chosen": -2.0502190589904785, "logits/rejected": -2.0510735511779785, "logps/chosen": -3.1639177799224854, "logps/rejected": -6.278886318206787, "loss": 0.2819, "rewards/accuracies": 1.0, "rewards/chosen": 1.6109046936035156, "rewards/margins": 1.1218737363815308, "rewards/rejected": 0.48903098702430725, "step": 3705 }, { "epoch": 2.0, "learning_rate": 5.2477580607118065e-08, "logits/chosen": -2.206073522567749, "logits/rejected": -2.205362319946289, "logps/chosen": -1.5837664604187012, "logps/rejected": -6.788594722747803, "loss": 0.3573, "rewards/accuracies": 1.0, "rewards/chosen": 1.1137431859970093, "rewards/margins": 0.8451895713806152, "rewards/rejected": 0.26855358481407166, "step": 3706 }, { "epoch": 2.0, "learning_rate": 5.245576935100394e-08, "logits/chosen": -2.1027283668518066, "logits/rejected": -2.107924222946167, "logps/chosen": -0.453050822019577, "logps/rejected": -16.712425231933594, "loss": 0.3401, "rewards/accuracies": 1.0, "rewards/chosen": 0.9553308486938477, "rewards/margins": 0.9035425186157227, "rewards/rejected": 0.051788330078125, "step": 3707 }, { "epoch": 2.0, "learning_rate": 5.2433957626435565e-08, "logits/chosen": -2.1409597396850586, "logits/rejected": -2.133756399154663, "logps/chosen": -5.487308025360107, "logps/rejected": -4.502067565917969, "loss": 0.3892, "rewards/accuracies": 1.0, "rewards/chosen": 1.2668291330337524, "rewards/margins": 0.7426469922065735, "rewards/rejected": 0.524182140827179, "step": 3708 }, { "epoch": 2.0, "learning_rate": 5.2412145437573663e-08, "logits/chosen": -2.0616517066955566, "logits/rejected": -2.286984920501709, "logps/chosen": -7.31498908996582, "logps/rejected": -4.809916973114014, "loss": 0.7484, "rewards/accuracies": 0.0, "rewards/chosen": 0.9883729815483093, "rewards/margins": -0.10756498575210571, "rewards/rejected": 1.095937967300415, "step": 3709 }, { "epoch": 2.0, "learning_rate": 5.239033278857904e-08, "logits/chosen": -2.1508243083953857, "logits/rejected": -2.137122869491577, "logps/chosen": -1.4626730680465698, "logps/rejected": -8.454195022583008, "loss": 0.3444, "rewards/accuracies": 1.0, "rewards/chosen": 1.1853734254837036, "rewards/margins": 0.8889007568359375, "rewards/rejected": 0.2964726388454437, "step": 3710 }, { "epoch": 2.0, "learning_rate": 5.236851968361261e-08, "logits/chosen": -2.114466428756714, "logits/rejected": -2.100851535797119, "logps/chosen": -3.151431083679199, "logps/rejected": -5.558091640472412, "loss": 0.4194, "rewards/accuracies": 1.0, "rewards/chosen": 1.0367056131362915, "rewards/margins": 0.6518300771713257, "rewards/rejected": 0.3848755359649658, "step": 3711 }, { "epoch": 2.0, "learning_rate": 5.234670612683538e-08, "logits/chosen": -1.9859801530838013, "logits/rejected": -1.98728609085083, "logps/chosen": -3.543386459350586, "logps/rejected": -3.8531103134155273, "loss": 0.2518, "rewards/accuracies": 1.0, "rewards/chosen": 1.7523609399795532, "rewards/margins": 1.2504527568817139, "rewards/rejected": 0.5019082427024841, "step": 3712 }, { "epoch": 2.0, "learning_rate": 5.232489212240843e-08, "logits/chosen": -2.022789239883423, "logits/rejected": -2.2970526218414307, "logps/chosen": -1.0569034814834595, "logps/rejected": -0.7755873799324036, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.8660707473754883, "rewards/margins": 0.029528141021728516, "rewards/rejected": 0.8365426063537598, "step": 3713 }, { "epoch": 2.0, "learning_rate": 5.23030776744929e-08, "logits/chosen": -2.1805789470672607, "logits/rejected": -2.1852986812591553, "logps/chosen": -0.17925551533699036, "logps/rejected": -4.857945442199707, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 0.8993444442749023, "rewards/margins": 0.5633188486099243, "rewards/rejected": 0.3360256254673004, "step": 3714 }, { "epoch": 2.0, "learning_rate": 5.228126278725007e-08, "logits/chosen": -2.1409008502960205, "logits/rejected": -2.317136526107788, "logps/chosen": -1.3514189720153809, "logps/rejected": -1.2549445629119873, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": 0.9953798651695251, "rewards/margins": -0.016893327236175537, "rewards/rejected": 1.0122731924057007, "step": 3715 }, { "epoch": 2.0, "learning_rate": 5.225944746484126e-08, "logits/chosen": -2.0247628688812256, "logits/rejected": -2.0217158794403076, "logps/chosen": -0.23617607355117798, "logps/rejected": -5.249111175537109, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": 0.9763687252998352, "rewards/margins": 0.5833815336227417, "rewards/rejected": 0.3929871618747711, "step": 3716 }, { "epoch": 2.0, "learning_rate": 5.223763171142789e-08, "logits/chosen": -2.071803569793701, "logits/rejected": -2.340688467025757, "logps/chosen": -16.675827026367188, "logps/rejected": -14.055713653564453, "loss": 0.7738, "rewards/accuracies": 0.0, "rewards/chosen": 0.23695392906665802, "rewards/margins": -0.15534763038158417, "rewards/rejected": 0.3923015594482422, "step": 3717 }, { "epoch": 2.01, "learning_rate": 5.221581553117145e-08, "logits/chosen": -1.9735243320465088, "logits/rejected": -2.2752487659454346, "logps/chosen": -0.7447187900543213, "logps/rejected": -0.7093822360038757, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 1.0447713136672974, "rewards/margins": 0.023874521255493164, "rewards/rejected": 1.0208967924118042, "step": 3718 }, { "epoch": 2.01, "learning_rate": 5.2193998928233526e-08, "logits/chosen": -2.1798672676086426, "logits/rejected": -2.186993360519409, "logps/chosen": -2.3736536502838135, "logps/rejected": -6.465550899505615, "loss": 0.2863, "rewards/accuracies": 1.0, "rewards/chosen": 1.6194709539413452, "rewards/margins": 1.1041584014892578, "rewards/rejected": 0.5153125524520874, "step": 3719 }, { "epoch": 2.01, "learning_rate": 5.21721819067758e-08, "logits/chosen": -2.1239190101623535, "logits/rejected": -2.3414864540100098, "logps/chosen": -0.48855704069137573, "logps/rejected": -0.5446541905403137, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.8915454745292664, "rewards/margins": 0.021052420139312744, "rewards/rejected": 0.8704930543899536, "step": 3720 }, { "epoch": 2.01, "learning_rate": 5.215036447096e-08, "logits/chosen": -2.1021385192871094, "logits/rejected": -2.2535572052001953, "logps/chosen": -0.1879400759935379, "logps/rejected": -0.22297289967536926, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.8153741955757141, "rewards/margins": 0.007732689380645752, "rewards/rejected": 0.8076415061950684, "step": 3721 }, { "epoch": 2.01, "learning_rate": 5.2128546624947914e-08, "logits/chosen": -2.090496063232422, "logits/rejected": -2.336613655090332, "logps/chosen": -0.3480602502822876, "logps/rejected": -0.3086106777191162, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.7564473748207092, "rewards/margins": 0.02105247974395752, "rewards/rejected": 0.7353948950767517, "step": 3722 }, { "epoch": 2.01, "learning_rate": 5.21067283729015e-08, "logits/chosen": -2.205655813217163, "logits/rejected": -2.1925244331359863, "logps/chosen": -9.908121109008789, "logps/rejected": -2.187131643295288, "loss": 0.4659, "rewards/accuracies": 1.0, "rewards/chosen": 1.5961512327194214, "rewards/margins": 0.5218639373779297, "rewards/rejected": 1.0742872953414917, "step": 3723 }, { "epoch": 2.01, "learning_rate": 5.208490971898268e-08, "logits/chosen": -2.1139426231384277, "logits/rejected": -2.110260248184204, "logps/chosen": -7.224118232727051, "logps/rejected": -3.6270523071289062, "loss": 0.3462, "rewards/accuracies": 1.0, "rewards/chosen": 1.3745449781417847, "rewards/margins": 0.8825522661209106, "rewards/rejected": 0.49199268221855164, "step": 3724 }, { "epoch": 2.01, "learning_rate": 5.206309066735354e-08, "logits/chosen": -2.149214506149292, "logits/rejected": -2.274836540222168, "logps/chosen": -2.957291603088379, "logps/rejected": -2.7357120513916016, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 1.0077019929885864, "rewards/margins": 0.003991842269897461, "rewards/rejected": 1.003710150718689, "step": 3725 }, { "epoch": 2.01, "learning_rate": 5.20412712221762e-08, "logits/chosen": -2.1062800884246826, "logits/rejected": -2.1035242080688477, "logps/chosen": -0.2458709180355072, "logps/rejected": -5.000790596008301, "loss": 0.4472, "rewards/accuracies": 1.0, "rewards/chosen": 0.9918710589408875, "rewards/margins": 0.5728136301040649, "rewards/rejected": 0.4190574586391449, "step": 3726 }, { "epoch": 2.01, "learning_rate": 5.2019451387612844e-08, "logits/chosen": -2.0359230041503906, "logits/rejected": -2.2861337661743164, "logps/chosen": -0.3668774962425232, "logps/rejected": -0.40344396233558655, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9517095685005188, "rewards/margins": -3.063678741455078e-05, "rewards/rejected": 0.9517402052879333, "step": 3727 }, { "epoch": 2.01, "learning_rate": 5.1997631167825786e-08, "logits/chosen": -2.1686980724334717, "logits/rejected": -2.302018880844116, "logps/chosen": -1.4750444889068604, "logps/rejected": -1.3436771631240845, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9671883583068848, "rewards/margins": 0.012619495391845703, "rewards/rejected": 0.9545688629150391, "step": 3728 }, { "epoch": 2.01, "learning_rate": 5.197581056697735e-08, "logits/chosen": -2.1797406673431396, "logits/rejected": -2.1175737380981445, "logps/chosen": -16.334213256835938, "logps/rejected": -9.877887725830078, "loss": 0.1762, "rewards/accuracies": 1.0, "rewards/chosen": 2.064357042312622, "rewards/margins": 1.646918773651123, "rewards/rejected": 0.4174383282661438, "step": 3729 }, { "epoch": 2.01, "learning_rate": 5.195398958922998e-08, "logits/chosen": -2.033892869949341, "logits/rejected": -2.022085189819336, "logps/chosen": -6.054530620574951, "logps/rejected": -15.523193359375, "loss": 0.4193, "rewards/accuracies": 1.0, "rewards/chosen": 1.2970682382583618, "rewards/margins": 0.6522785425186157, "rewards/rejected": 0.6447896957397461, "step": 3730 }, { "epoch": 2.01, "learning_rate": 5.1932168238746154e-08, "logits/chosen": -2.068638563156128, "logits/rejected": -2.3129355907440186, "logps/chosen": -0.2027297019958496, "logps/rejected": -0.22339877486228943, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.7695609927177429, "rewards/margins": 7.015466690063477e-05, "rewards/rejected": 0.7694908380508423, "step": 3731 }, { "epoch": 2.01, "learning_rate": 5.1910346519688466e-08, "logits/chosen": -2.0184757709503174, "logits/rejected": -2.240488290786743, "logps/chosen": -0.9019628763198853, "logps/rejected": -0.8379919528961182, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.8459958434104919, "rewards/margins": 0.009861588478088379, "rewards/rejected": 0.8361342549324036, "step": 3732 }, { "epoch": 2.01, "learning_rate": 5.18885244362195e-08, "logits/chosen": -2.125321626663208, "logits/rejected": -2.3030943870544434, "logps/chosen": -1.760031819343567, "logps/rejected": -1.559831142425537, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 1.0827076435089111, "rewards/margins": 0.0029916763305664062, "rewards/rejected": 1.0797159671783447, "step": 3733 }, { "epoch": 2.01, "learning_rate": 5.186670199250201e-08, "logits/chosen": -2.2120635509490967, "logits/rejected": -2.203242301940918, "logps/chosen": -6.808483600616455, "logps/rejected": -3.783334970474243, "loss": 0.4384, "rewards/accuracies": 1.0, "rewards/chosen": 1.3303896188735962, "rewards/margins": 0.5973538756370544, "rewards/rejected": 0.7330357432365417, "step": 3734 }, { "epoch": 2.01, "learning_rate": 5.184487919269877e-08, "logits/chosen": -2.174666166305542, "logits/rejected": -2.2193024158477783, "logps/chosen": -1.4264023303985596, "logps/rejected": -1.9111663103103638, "loss": 0.666, "rewards/accuracies": 1.0, "rewards/chosen": 1.0596123933792114, "rewards/margins": 0.055034756660461426, "rewards/rejected": 1.00457763671875, "step": 3735 }, { "epoch": 2.02, "learning_rate": 5.182305604097259e-08, "logits/chosen": -1.9793198108673096, "logits/rejected": -1.9854005575180054, "logps/chosen": -2.6104846000671387, "logps/rejected": -4.774439811706543, "loss": 0.3483, "rewards/accuracies": 1.0, "rewards/chosen": 1.3773221969604492, "rewards/margins": 0.8754554390907288, "rewards/rejected": 0.5018667578697205, "step": 3736 }, { "epoch": 2.02, "learning_rate": 5.1801232541486415e-08, "logits/chosen": -2.2313594818115234, "logits/rejected": -2.225040912628174, "logps/chosen": -3.347681999206543, "logps/rejected": -1.7740206718444824, "loss": 0.5063, "rewards/accuracies": 1.0, "rewards/chosen": 1.153971552848816, "rewards/margins": 0.41679537296295166, "rewards/rejected": 0.7371761798858643, "step": 3737 }, { "epoch": 2.02, "learning_rate": 5.1779408698403203e-08, "logits/chosen": -2.0118303298950195, "logits/rejected": -2.1004562377929688, "logps/chosen": -12.224924087524414, "logps/rejected": -16.422393798828125, "loss": 0.4329, "rewards/accuracies": 1.0, "rewards/chosen": 1.6653581857681274, "rewards/margins": 0.6131366491317749, "rewards/rejected": 1.0522215366363525, "step": 3738 }, { "epoch": 2.02, "learning_rate": 5.175758451588601e-08, "logits/chosen": -2.0068459510803223, "logits/rejected": -2.0059823989868164, "logps/chosen": -5.677711486816406, "logps/rejected": -16.30582046508789, "loss": 0.4294, "rewards/accuracies": 1.0, "rewards/chosen": 0.8846031427383423, "rewards/margins": 0.6228645443916321, "rewards/rejected": 0.2617385983467102, "step": 3739 }, { "epoch": 2.02, "learning_rate": 5.173575999809794e-08, "logits/chosen": -2.1626224517822266, "logits/rejected": -2.3303534984588623, "logps/chosen": -0.5464808344841003, "logps/rejected": -0.6283169388771057, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 1.1098684072494507, "rewards/margins": 0.01978147029876709, "rewards/rejected": 1.0900869369506836, "step": 3740 }, { "epoch": 2.02, "learning_rate": 5.1713935149202145e-08, "logits/chosen": -2.01552152633667, "logits/rejected": -2.26098895072937, "logps/chosen": -0.5630654096603394, "logps/rejected": -0.6317006349563599, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.8467910885810852, "rewards/margins": 0.007721304893493652, "rewards/rejected": 0.8390697836875916, "step": 3741 }, { "epoch": 2.02, "learning_rate": 5.1692109973361885e-08, "logits/chosen": -2.1172738075256348, "logits/rejected": -2.292388677597046, "logps/chosen": -0.9050085544586182, "logps/rejected": -5.672313690185547, "loss": 0.5757, "rewards/accuracies": 1.0, "rewards/chosen": 0.9752287268638611, "rewards/margins": 0.25047188997268677, "rewards/rejected": 0.7247568368911743, "step": 3742 }, { "epoch": 2.02, "learning_rate": 5.1670284474740444e-08, "logits/chosen": -2.0750560760498047, "logits/rejected": -2.0752971172332764, "logps/chosen": -1.5430381298065186, "logps/rejected": -1.3906784057617188, "loss": 0.5505, "rewards/accuracies": 1.0, "rewards/chosen": 1.2107634544372559, "rewards/margins": 0.3090202212333679, "rewards/rejected": 0.9017432332038879, "step": 3743 }, { "epoch": 2.02, "learning_rate": 5.1648458657501173e-08, "logits/chosen": -1.984864354133606, "logits/rejected": -2.053650140762329, "logps/chosen": -4.162680149078369, "logps/rejected": -23.34314727783203, "loss": 0.2604, "rewards/accuracies": 1.0, "rewards/chosen": 1.5641489028930664, "rewards/margins": 1.2123216390609741, "rewards/rejected": 0.3518272340297699, "step": 3744 }, { "epoch": 2.02, "learning_rate": 5.162663252580751e-08, "logits/chosen": -2.0612947940826416, "logits/rejected": -2.2474400997161865, "logps/chosen": -4.225127220153809, "logps/rejected": -5.475074768066406, "loss": 0.5368, "rewards/accuracies": 1.0, "rewards/chosen": 0.9090535044670105, "rewards/margins": 0.3418099880218506, "rewards/rejected": 0.5672435164451599, "step": 3745 }, { "epoch": 2.02, "learning_rate": 5.1604806083822925e-08, "logits/chosen": -2.3057057857513428, "logits/rejected": -2.1805927753448486, "logps/chosen": -29.522903442382812, "logps/rejected": -3.2477426528930664, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": 2.551400899887085, "rewards/margins": 1.6047899723052979, "rewards/rejected": 0.9466108679771423, "step": 3746 }, { "epoch": 2.02, "learning_rate": 5.1582979335710976e-08, "logits/chosen": -2.120429515838623, "logits/rejected": -2.125844717025757, "logps/chosen": -2.8109514713287354, "logps/rejected": -1.4300041198730469, "loss": 0.5805, "rewards/accuracies": 1.0, "rewards/chosen": 1.4129561185836792, "rewards/margins": 0.23960256576538086, "rewards/rejected": 1.1733535528182983, "step": 3747 }, { "epoch": 2.02, "learning_rate": 5.156115228563521e-08, "logits/chosen": -2.006124973297119, "logits/rejected": -2.0167200565338135, "logps/chosen": -1.5436038970947266, "logps/rejected": -2.2921321392059326, "loss": 0.4549, "rewards/accuracies": 1.0, "rewards/chosen": 1.2502973079681396, "rewards/margins": 0.5516853928565979, "rewards/rejected": 0.6986119151115417, "step": 3748 }, { "epoch": 2.02, "learning_rate": 5.1539324937759334e-08, "logits/chosen": -2.195436954498291, "logits/rejected": -2.348970413208008, "logps/chosen": -16.37575912475586, "logps/rejected": -4.056329250335693, "loss": 0.7049, "rewards/accuracies": 0.0, "rewards/chosen": 0.8862270712852478, "rewards/margins": -0.023282229900360107, "rewards/rejected": 0.9095093011856079, "step": 3749 }, { "epoch": 2.02, "learning_rate": 5.1517497296247e-08, "logits/chosen": -1.9947023391723633, "logits/rejected": -2.0019710063934326, "logps/chosen": -1.9331986904144287, "logps/rejected": -4.432471752166748, "loss": 0.4286, "rewards/accuracies": 1.0, "rewards/chosen": 1.1191809177398682, "rewards/margins": 0.6252633929252625, "rewards/rejected": 0.4939175248146057, "step": 3750 }, { "epoch": 2.02, "learning_rate": 5.149566936526204e-08, "logits/chosen": -2.1818900108337402, "logits/rejected": -2.009225845336914, "logps/chosen": -35.12950134277344, "logps/rejected": -4.6499152183532715, "loss": 0.1873, "rewards/accuracies": 1.0, "rewards/chosen": 2.287731885910034, "rewards/margins": 1.579817295074463, "rewards/rejected": 0.7079145908355713, "step": 3751 }, { "epoch": 2.02, "learning_rate": 5.147384114896825e-08, "logits/chosen": -2.2148964405059814, "logits/rejected": -2.2182672023773193, "logps/chosen": -0.6615829467773438, "logps/rejected": -4.297597885131836, "loss": 0.4883, "rewards/accuracies": 1.0, "rewards/chosen": 1.0394350290298462, "rewards/margins": 0.46284782886505127, "rewards/rejected": 0.5765872001647949, "step": 3752 }, { "epoch": 2.02, "learning_rate": 5.14520126515295e-08, "logits/chosen": -2.02500581741333, "logits/rejected": -2.0379631519317627, "logps/chosen": -1.298789143562317, "logps/rejected": -7.295497894287109, "loss": 0.4152, "rewards/accuracies": 1.0, "rewards/chosen": 1.1455501317977905, "rewards/margins": 0.6642242670059204, "rewards/rejected": 0.48132583498954773, "step": 3753 }, { "epoch": 2.02, "learning_rate": 5.143018387710971e-08, "logits/chosen": -2.152571678161621, "logits/rejected": -2.3233284950256348, "logps/chosen": -22.611234664916992, "logps/rejected": -7.967984199523926, "loss": 0.5085, "rewards/accuracies": 1.0, "rewards/chosen": 1.352591872215271, "rewards/margins": 0.41122519969940186, "rewards/rejected": 0.9413666725158691, "step": 3754 }, { "epoch": 2.03, "learning_rate": 5.14083548298729e-08, "logits/chosen": -2.225769281387329, "logits/rejected": -2.2203831672668457, "logps/chosen": -5.271717548370361, "logps/rejected": -3.0135855674743652, "loss": 0.4098, "rewards/accuracies": 1.0, "rewards/chosen": 1.2319484949111938, "rewards/margins": 0.6800787448883057, "rewards/rejected": 0.5518697500228882, "step": 3755 }, { "epoch": 2.03, "learning_rate": 5.138652551398307e-08, "logits/chosen": -2.065067768096924, "logits/rejected": -2.0566046237945557, "logps/chosen": -11.18626594543457, "logps/rejected": -4.3631911277771, "loss": 0.366, "rewards/accuracies": 1.0, "rewards/chosen": 1.236174464225769, "rewards/margins": 0.8166364431381226, "rewards/rejected": 0.4195379912853241, "step": 3756 }, { "epoch": 2.03, "learning_rate": 5.136469593360433e-08, "logits/chosen": -1.9981732368469238, "logits/rejected": -2.302069664001465, "logps/chosen": -3.1727466583251953, "logps/rejected": -0.43763598799705505, "loss": 0.734, "rewards/accuracies": 0.0, "rewards/chosen": 0.8873879313468933, "rewards/margins": -0.08019185066223145, "rewards/rejected": 0.9675797820091248, "step": 3757 }, { "epoch": 2.03, "learning_rate": 5.1342866092900796e-08, "logits/chosen": -2.0806145668029785, "logits/rejected": -2.2825558185577393, "logps/chosen": -0.3632272481918335, "logps/rejected": -0.37684547901153564, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.9055620431900024, "rewards/margins": 0.009091794490814209, "rewards/rejected": 0.8964702486991882, "step": 3758 }, { "epoch": 2.03, "learning_rate": 5.132103599603668e-08, "logits/chosen": -1.9463300704956055, "logits/rejected": -2.3336002826690674, "logps/chosen": -5.353248119354248, "logps/rejected": -5.63527774810791, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.49385571479797363, "rewards/margins": 0.0042991042137146, "rewards/rejected": 0.48955661058425903, "step": 3759 }, { "epoch": 2.03, "learning_rate": 5.129920564717618e-08, "logits/chosen": -2.1548829078674316, "logits/rejected": -2.30126953125, "logps/chosen": -1.3716225624084473, "logps/rejected": -4.094202041625977, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.9285385012626648, "rewards/margins": 0.00617063045501709, "rewards/rejected": 0.9223678708076477, "step": 3760 }, { "epoch": 2.03, "learning_rate": 5.127737505048361e-08, "logits/chosen": -2.0761070251464844, "logits/rejected": -2.2719411849975586, "logps/chosen": -0.678924024105072, "logps/rejected": -0.6837059855461121, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.9024228453636169, "rewards/margins": 0.006551980972290039, "rewards/rejected": 0.8958708643913269, "step": 3761 }, { "epoch": 2.03, "learning_rate": 5.125554421012332e-08, "logits/chosen": -2.2543554306030273, "logits/rejected": -2.2665696144104004, "logps/chosen": -4.729633331298828, "logps/rejected": -5.395711421966553, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 1.1608459949493408, "rewards/margins": 0.6749482154846191, "rewards/rejected": 0.4858977794647217, "step": 3762 }, { "epoch": 2.03, "learning_rate": 5.1233713130259624e-08, "logits/chosen": -2.127232789993286, "logits/rejected": -2.1133720874786377, "logps/chosen": -7.164337635040283, "logps/rejected": -4.428662300109863, "loss": 0.3821, "rewards/accuracies": 1.0, "rewards/chosen": 1.227317452430725, "rewards/margins": 0.7649251222610474, "rewards/rejected": 0.46239233016967773, "step": 3763 }, { "epoch": 2.03, "learning_rate": 5.1211881815057e-08, "logits/chosen": -2.1190407276153564, "logits/rejected": -2.0873568058013916, "logps/chosen": -10.85729694366455, "logps/rejected": -2.2903008460998535, "loss": 0.2812, "rewards/accuracies": 1.0, "rewards/chosen": 1.7739921808242798, "rewards/margins": 1.1248705387115479, "rewards/rejected": 0.6491217017173767, "step": 3764 }, { "epoch": 2.03, "learning_rate": 5.1190050268679896e-08, "logits/chosen": -2.1516520977020264, "logits/rejected": -2.1988627910614014, "logps/chosen": -5.421881675720215, "logps/rejected": -22.611934661865234, "loss": 0.4177, "rewards/accuracies": 1.0, "rewards/chosen": 1.338802695274353, "rewards/margins": 0.6569282412528992, "rewards/rejected": 0.6818744540214539, "step": 3765 }, { "epoch": 2.03, "learning_rate": 5.1168218495292823e-08, "logits/chosen": -2.0180954933166504, "logits/rejected": -2.25962233543396, "logps/chosen": -0.2740834355354309, "logps/rejected": -0.29287922382354736, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.9906004071235657, "rewards/margins": -0.0031723976135253906, "rewards/rejected": 0.9937728047370911, "step": 3766 }, { "epoch": 2.03, "learning_rate": 5.1146386499060335e-08, "logits/chosen": -2.1129813194274902, "logits/rejected": -2.080922842025757, "logps/chosen": -5.3696393966674805, "logps/rejected": -5.226140975952148, "loss": 0.4011, "rewards/accuracies": 1.0, "rewards/chosen": 1.1970280408859253, "rewards/margins": 0.7062939405441284, "rewards/rejected": 0.4907341003417969, "step": 3767 }, { "epoch": 2.03, "learning_rate": 5.1124554284147035e-08, "logits/chosen": -2.136547088623047, "logits/rejected": -2.1241893768310547, "logps/chosen": -4.996170997619629, "logps/rejected": -8.909674644470215, "loss": 0.4784, "rewards/accuracies": 1.0, "rewards/chosen": 1.0399659872055054, "rewards/margins": 0.48871731758117676, "rewards/rejected": 0.5512486696243286, "step": 3768 }, { "epoch": 2.03, "learning_rate": 5.1102721854717566e-08, "logits/chosen": -2.1115920543670654, "logits/rejected": -2.1108250617980957, "logps/chosen": -0.6354750990867615, "logps/rejected": -1.4782123565673828, "loss": 0.6129, "rewards/accuracies": 1.0, "rewards/chosen": 1.0082151889801025, "rewards/margins": 0.16752773523330688, "rewards/rejected": 0.8406874537467957, "step": 3769 }, { "epoch": 2.03, "learning_rate": 5.108088921493661e-08, "logits/chosen": -2.1592013835906982, "logits/rejected": -2.321063995361328, "logps/chosen": -0.9568681716918945, "logps/rejected": -0.962879478931427, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9741736650466919, "rewards/margins": 0.017028987407684326, "rewards/rejected": 0.9571446776390076, "step": 3770 }, { "epoch": 2.03, "learning_rate": 5.105905636896888e-08, "logits/chosen": -1.9871501922607422, "logits/rejected": -2.254979133605957, "logps/chosen": -13.778348922729492, "logps/rejected": -0.47443684935569763, "loss": 0.4691, "rewards/accuracies": 1.0, "rewards/chosen": 1.3344255685806274, "rewards/margins": 0.5133244395256042, "rewards/rejected": 0.8211011290550232, "step": 3771 }, { "epoch": 2.03, "learning_rate": 5.1037223320979116e-08, "logits/chosen": -2.020219087600708, "logits/rejected": -2.3003804683685303, "logps/chosen": -2.367978572845459, "logps/rejected": -2.2049286365509033, "loss": 0.7047, "rewards/accuracies": 0.0, "rewards/chosen": 0.6102328896522522, "rewards/margins": -0.022969603538513184, "rewards/rejected": 0.6332024931907654, "step": 3772 }, { "epoch": 2.04, "learning_rate": 5.101539007513215e-08, "logits/chosen": -2.0980422496795654, "logits/rejected": -2.1019644737243652, "logps/chosen": -1.7314990758895874, "logps/rejected": -3.522031545639038, "loss": 0.4225, "rewards/accuracies": 1.0, "rewards/chosen": 1.3739233016967773, "rewards/margins": 0.6429680585861206, "rewards/rejected": 0.7309552431106567, "step": 3773 }, { "epoch": 2.04, "learning_rate": 5.09935566355928e-08, "logits/chosen": -2.1203391551971436, "logits/rejected": -2.1450672149658203, "logps/chosen": -1.3160778284072876, "logps/rejected": -7.1167311668396, "loss": 0.4208, "rewards/accuracies": 1.0, "rewards/chosen": 1.2586690187454224, "rewards/margins": 0.6479750275611877, "rewards/rejected": 0.6106939911842346, "step": 3774 }, { "epoch": 2.04, "learning_rate": 5.0971723006525955e-08, "logits/chosen": -2.0175092220306396, "logits/rejected": -1.9837802648544312, "logps/chosen": -13.154385566711426, "logps/rejected": -2.6043317317962646, "loss": 0.3147, "rewards/accuracies": 1.0, "rewards/chosen": 1.7708557844161987, "rewards/margins": 0.994762122631073, "rewards/rejected": 0.7760936617851257, "step": 3775 }, { "epoch": 2.04, "learning_rate": 5.0949889192096515e-08, "logits/chosen": -1.9652245044708252, "logits/rejected": -2.2683095932006836, "logps/chosen": -2.049346685409546, "logps/rejected": -2.010760545730591, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.7085588574409485, "rewards/margins": -0.011472582817077637, "rewards/rejected": 0.7200314402580261, "step": 3776 }, { "epoch": 2.04, "learning_rate": 5.092805519646942e-08, "logits/chosen": -2.1339869499206543, "logits/rejected": -2.3052024841308594, "logps/chosen": -2.9235804080963135, "logps/rejected": -2.942227602005005, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8036112189292908, "rewards/margins": 0.012212038040161133, "rewards/rejected": 0.7913991808891296, "step": 3777 }, { "epoch": 2.04, "learning_rate": 5.0906221023809646e-08, "logits/chosen": -1.9757657051086426, "logits/rejected": -1.9802749156951904, "logps/chosen": -0.7382745742797852, "logps/rejected": -4.287261962890625, "loss": 0.5956, "rewards/accuracies": 1.0, "rewards/chosen": 1.2213996648788452, "rewards/margins": 0.20554053783416748, "rewards/rejected": 1.0158591270446777, "step": 3778 }, { "epoch": 2.04, "learning_rate": 5.088438667828222e-08, "logits/chosen": -2.1762592792510986, "logits/rejected": -2.1815199851989746, "logps/chosen": -1.9642789363861084, "logps/rejected": -4.4465508460998535, "loss": 0.472, "rewards/accuracies": 1.0, "rewards/chosen": 1.1469767093658447, "rewards/margins": 0.5054221749305725, "rewards/rejected": 0.6415545344352722, "step": 3779 }, { "epoch": 2.04, "learning_rate": 5.086255216405218e-08, "logits/chosen": -2.0373313426971436, "logits/rejected": -2.0377280712127686, "logps/chosen": -0.2449115365743637, "logps/rejected": -2.878988027572632, "loss": 0.5267, "rewards/accuracies": 1.0, "rewards/chosen": 0.8853154182434082, "rewards/margins": 0.36622291803359985, "rewards/rejected": 0.5190925002098083, "step": 3780 }, { "epoch": 2.04, "learning_rate": 5.0840717485284615e-08, "logits/chosen": -2.0613508224487305, "logits/rejected": -2.0547120571136475, "logps/chosen": -7.495209693908691, "logps/rejected": -8.390579223632812, "loss": 0.3968, "rewards/accuracies": 1.0, "rewards/chosen": 1.1451624631881714, "rewards/margins": 0.7194390892982483, "rewards/rejected": 0.4257233738899231, "step": 3781 }, { "epoch": 2.04, "learning_rate": 5.0818882646144614e-08, "logits/chosen": -2.030763864517212, "logits/rejected": -2.318493366241455, "logps/chosen": -7.245113372802734, "logps/rejected": -7.084011077880859, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.6589931845664978, "rewards/margins": 0.006566286087036133, "rewards/rejected": 0.6524268984794617, "step": 3782 }, { "epoch": 2.04, "learning_rate": 5.079704765079732e-08, "logits/chosen": -2.096238851547241, "logits/rejected": -2.294069290161133, "logps/chosen": -0.9108461141586304, "logps/rejected": -0.9949739575386047, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0175784826278687, "rewards/margins": 0.011224746704101562, "rewards/rejected": 1.006353735923767, "step": 3783 }, { "epoch": 2.04, "learning_rate": 5.0775212503407904e-08, "logits/chosen": -2.1583592891693115, "logits/rejected": -2.3084876537323, "logps/chosen": -1.9089879989624023, "logps/rejected": -0.584010124206543, "loss": 0.7263, "rewards/accuracies": 0.0, "rewards/chosen": 0.9081401228904724, "rewards/margins": -0.06516176462173462, "rewards/rejected": 0.973301887512207, "step": 3784 }, { "epoch": 2.04, "learning_rate": 5.07533772081416e-08, "logits/chosen": -2.0773513317108154, "logits/rejected": -2.0793657302856445, "logps/chosen": -3.833890914916992, "logps/rejected": -3.4153194427490234, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 1.6904973983764648, "rewards/margins": 0.9920759201049805, "rewards/rejected": 0.6984214782714844, "step": 3785 }, { "epoch": 2.04, "learning_rate": 5.073154176916359e-08, "logits/chosen": -2.0428214073181152, "logits/rejected": -2.044339179992676, "logps/chosen": -1.9930548667907715, "logps/rejected": -1.3284271955490112, "loss": 0.4149, "rewards/accuracies": 1.0, "rewards/chosen": 1.579530119895935, "rewards/margins": 0.6650408506393433, "rewards/rejected": 0.9144892692565918, "step": 3786 }, { "epoch": 2.04, "learning_rate": 5.0709706190639146e-08, "logits/chosen": -2.1164660453796387, "logits/rejected": -2.12595796585083, "logps/chosen": -1.2713356018066406, "logps/rejected": -2.126206398010254, "loss": 0.493, "rewards/accuracies": 1.0, "rewards/chosen": 1.2154020071029663, "rewards/margins": 0.4507599472999573, "rewards/rejected": 0.764642059803009, "step": 3787 }, { "epoch": 2.04, "learning_rate": 5.068787047673355e-08, "logits/chosen": -2.1632354259490967, "logits/rejected": -2.1585805416107178, "logps/chosen": -2.4512271881103516, "logps/rejected": -6.045941352844238, "loss": 0.5783, "rewards/accuracies": 1.0, "rewards/chosen": 0.946921169757843, "rewards/margins": 0.24470126628875732, "rewards/rejected": 0.7022199034690857, "step": 3788 }, { "epoch": 2.04, "learning_rate": 5.0666034631612096e-08, "logits/chosen": -2.1571598052978516, "logits/rejected": -2.261323928833008, "logps/chosen": -4.180140495300293, "logps/rejected": -2.063415765762329, "loss": 0.6192, "rewards/accuracies": 1.0, "rewards/chosen": 1.0542988777160645, "rewards/margins": 0.15386587381362915, "rewards/rejected": 0.9004330039024353, "step": 3789 }, { "epoch": 2.04, "learning_rate": 5.064419865944014e-08, "logits/chosen": -2.1655571460723877, "logits/rejected": -2.358049154281616, "logps/chosen": -1.1983659267425537, "logps/rejected": -1.1711310148239136, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 1.0393165349960327, "rewards/margins": 0.009561419486999512, "rewards/rejected": 1.0297551155090332, "step": 3790 }, { "epoch": 2.04, "learning_rate": 5.062236256438303e-08, "logits/chosen": -2.122915506362915, "logits/rejected": -2.3364429473876953, "logps/chosen": -0.5731391310691833, "logps/rejected": -0.6302870512008667, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0390533208847046, "rewards/margins": 0.011180281639099121, "rewards/rejected": 1.0278730392456055, "step": 3791 }, { "epoch": 2.05, "learning_rate": 5.060052635060612e-08, "logits/chosen": -2.012802839279175, "logits/rejected": -2.0058584213256836, "logps/chosen": -2.8945791721343994, "logps/rejected": -6.027594089508057, "loss": 0.2785, "rewards/accuracies": 1.0, "rewards/chosen": 1.4504728317260742, "rewards/margins": 1.1358311176300049, "rewards/rejected": 0.31464171409606934, "step": 3792 }, { "epoch": 2.05, "learning_rate": 5.057869002227485e-08, "logits/chosen": -2.006035327911377, "logits/rejected": -2.2876245975494385, "logps/chosen": -0.5157460570335388, "logps/rejected": -0.6031482219696045, "loss": 0.7025, "rewards/accuracies": 0.0, "rewards/chosen": 0.9037863612174988, "rewards/margins": -0.01861405372619629, "rewards/rejected": 0.9224004149436951, "step": 3793 }, { "epoch": 2.05, "learning_rate": 5.055685358355464e-08, "logits/chosen": -2.0353291034698486, "logits/rejected": -2.3614919185638428, "logps/chosen": -0.65767502784729, "logps/rejected": -4.759917259216309, "loss": 0.6373, "rewards/accuracies": 1.0, "rewards/chosen": 1.0572993755340576, "rewards/margins": 0.11502116918563843, "rewards/rejected": 0.9422782063484192, "step": 3794 }, { "epoch": 2.05, "learning_rate": 5.0535017038610896e-08, "logits/chosen": -2.180887222290039, "logits/rejected": -2.1790575981140137, "logps/chosen": -1.0290892124176025, "logps/rejected": -6.177365303039551, "loss": 0.4471, "rewards/accuracies": 1.0, "rewards/chosen": 1.0291990041732788, "rewards/margins": 0.5729866027832031, "rewards/rejected": 0.45621243119239807, "step": 3795 }, { "epoch": 2.05, "learning_rate": 5.051318039160912e-08, "logits/chosen": -2.1463308334350586, "logits/rejected": -2.274172782897949, "logps/chosen": -7.6246466636657715, "logps/rejected": -4.38749361038208, "loss": 0.7334, "rewards/accuracies": 0.0, "rewards/chosen": 0.8012180328369141, "rewards/margins": -0.07901346683502197, "rewards/rejected": 0.880231499671936, "step": 3796 }, { "epoch": 2.05, "learning_rate": 5.049134364671479e-08, "logits/chosen": -2.0663394927978516, "logits/rejected": -2.214892864227295, "logps/chosen": -0.8852037787437439, "logps/rejected": -0.8130180835723877, "loss": 0.7005, "rewards/accuracies": 0.0, "rewards/chosen": 0.9823047518730164, "rewards/margins": -0.014568805694580078, "rewards/rejected": 0.9968735575675964, "step": 3797 }, { "epoch": 2.05, "learning_rate": 5.0469506808093396e-08, "logits/chosen": -2.1026687622070312, "logits/rejected": -2.109318971633911, "logps/chosen": -2.0546534061431885, "logps/rejected": -3.7702982425689697, "loss": 0.4564, "rewards/accuracies": 1.0, "rewards/chosen": 1.0415029525756836, "rewards/margins": 0.5476381182670593, "rewards/rejected": 0.49386483430862427, "step": 3798 }, { "epoch": 2.05, "learning_rate": 5.044766987991049e-08, "logits/chosen": -2.092250108718872, "logits/rejected": -2.319457769393921, "logps/chosen": -3.430206298828125, "logps/rejected": -3.5734219551086426, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 1.4118530750274658, "rewards/margins": 0.01604139804840088, "rewards/rejected": 1.395811676979065, "step": 3799 }, { "epoch": 2.05, "learning_rate": 5.042583286633158e-08, "logits/chosen": -1.9797474145889282, "logits/rejected": -2.2850470542907715, "logps/chosen": -0.8526270985603333, "logps/rejected": -7.800903797149658, "loss": 0.5118, "rewards/accuracies": 1.0, "rewards/chosen": 0.8548002243041992, "rewards/margins": 0.40293240547180176, "rewards/rejected": 0.45186781883239746, "step": 3800 }, { "epoch": 2.05, "learning_rate": 5.040399577152224e-08, "logits/chosen": -2.1329832077026367, "logits/rejected": -2.1401121616363525, "logps/chosen": -3.657946825027466, "logps/rejected": -2.7656164169311523, "loss": 0.425, "rewards/accuracies": 1.0, "rewards/chosen": 1.1804218292236328, "rewards/margins": 0.6356049180030823, "rewards/rejected": 0.5448169112205505, "step": 3801 }, { "epoch": 2.05, "learning_rate": 5.038215859964802e-08, "logits/chosen": -2.1085877418518066, "logits/rejected": -2.1196560859680176, "logps/chosen": -0.48766571283340454, "logps/rejected": -6.761869430541992, "loss": 0.5532, "rewards/accuracies": 1.0, "rewards/chosen": 1.1055630445480347, "rewards/margins": 0.30280452966690063, "rewards/rejected": 0.802758514881134, "step": 3802 }, { "epoch": 2.05, "learning_rate": 5.0360321354874514e-08, "logits/chosen": -2.081679105758667, "logits/rejected": -2.1917037963867188, "logps/chosen": -0.993486762046814, "logps/rejected": -1.005388855934143, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.7965342998504639, "rewards/margins": 0.039048969745635986, "rewards/rejected": 0.7574853301048279, "step": 3803 }, { "epoch": 2.05, "learning_rate": 5.0338484041367335e-08, "logits/chosen": -2.0134358406066895, "logits/rejected": -2.0246312618255615, "logps/chosen": -1.3263002634048462, "logps/rejected": -3.3090457916259766, "loss": 0.4779, "rewards/accuracies": 1.0, "rewards/chosen": 1.0940749645233154, "rewards/margins": 0.48995691537857056, "rewards/rejected": 0.6041180491447449, "step": 3804 }, { "epoch": 2.05, "learning_rate": 5.0316646663292085e-08, "logits/chosen": -2.1072463989257812, "logits/rejected": -2.2942285537719727, "logps/chosen": -0.41029027104377747, "logps/rejected": -0.4825644791126251, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.8770844340324402, "rewards/margins": 0.013538062572479248, "rewards/rejected": 0.8635463714599609, "step": 3805 }, { "epoch": 2.05, "learning_rate": 5.029480922481437e-08, "logits/chosen": -2.245025873184204, "logits/rejected": -2.244675636291504, "logps/chosen": -1.7383320331573486, "logps/rejected": -1.3473103046417236, "loss": 0.549, "rewards/accuracies": 1.0, "rewards/chosen": 0.9964348673820496, "rewards/margins": 0.312613844871521, "rewards/rejected": 0.6838210225105286, "step": 3806 }, { "epoch": 2.05, "learning_rate": 5.0272971730099854e-08, "logits/chosen": -2.1594316959381104, "logits/rejected": -2.270902395248413, "logps/chosen": -9.254033088684082, "logps/rejected": -9.090165138244629, "loss": 0.6968, "rewards/accuracies": 0.0, "rewards/chosen": 0.9099417924880981, "rewards/margins": -0.007366180419921875, "rewards/rejected": 0.91730797290802, "step": 3807 }, { "epoch": 2.05, "learning_rate": 5.025113418331419e-08, "logits/chosen": -2.2214157581329346, "logits/rejected": -2.2167954444885254, "logps/chosen": -0.16036731004714966, "logps/rejected": -12.053901672363281, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427005052566528, "rewards/margins": 0.9445565342903137, "rewards/rejected": -0.0018560410244390368, "step": 3808 }, { "epoch": 2.05, "learning_rate": 5.0229296588623015e-08, "logits/chosen": -2.1641573905944824, "logits/rejected": -2.2565717697143555, "logps/chosen": -1.2754075527191162, "logps/rejected": -1.2594668865203857, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9128004312515259, "rewards/margins": 0.022243201732635498, "rewards/rejected": 0.8905572295188904, "step": 3809 }, { "epoch": 2.06, "learning_rate": 5.0207458950191986e-08, "logits/chosen": -2.075981378555298, "logits/rejected": -2.079922676086426, "logps/chosen": -0.645675539970398, "logps/rejected": -3.650710344314575, "loss": 0.4685, "rewards/accuracies": 1.0, "rewards/chosen": 0.951039731502533, "rewards/margins": 0.5149132013320923, "rewards/rejected": 0.4361265301704407, "step": 3810 }, { "epoch": 2.06, "learning_rate": 5.0185621272186797e-08, "logits/chosen": -2.0470616817474365, "logits/rejected": -2.257963180541992, "logps/chosen": -0.48186397552490234, "logps/rejected": -0.5916088223457336, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.9820317625999451, "rewards/margins": -0.00268477201461792, "rewards/rejected": 0.984716534614563, "step": 3811 }, { "epoch": 2.06, "learning_rate": 5.0163783558773095e-08, "logits/chosen": -2.1164958477020264, "logits/rejected": -2.247803211212158, "logps/chosen": -4.854415416717529, "logps/rejected": -4.238906383514404, "loss": 0.716, "rewards/accuracies": 0.0, "rewards/chosen": 0.8671463131904602, "rewards/margins": -0.04513818025588989, "rewards/rejected": 0.9122844934463501, "step": 3812 }, { "epoch": 2.06, "learning_rate": 5.0141945814116624e-08, "logits/chosen": -2.058302640914917, "logits/rejected": -2.0660345554351807, "logps/chosen": -0.40004125237464905, "logps/rejected": -8.158796310424805, "loss": 0.3717, "rewards/accuracies": 1.0, "rewards/chosen": 1.0283336639404297, "rewards/margins": 0.7979657053947449, "rewards/rejected": 0.23036794364452362, "step": 3813 }, { "epoch": 2.06, "learning_rate": 5.012010804238304e-08, "logits/chosen": -2.2296195030212402, "logits/rejected": -2.10318922996521, "logps/chosen": -21.100183486938477, "logps/rejected": -11.813879013061523, "loss": 0.2285, "rewards/accuracies": 1.0, "rewards/chosen": 1.8305937051773071, "rewards/margins": 1.3596887588500977, "rewards/rejected": 0.4709049165248871, "step": 3814 }, { "epoch": 2.06, "learning_rate": 5.009827024773805e-08, "logits/chosen": -2.0844106674194336, "logits/rejected": -2.295518159866333, "logps/chosen": -4.914404392242432, "logps/rejected": -0.8007165193557739, "loss": 0.9207, "rewards/accuracies": 0.0, "rewards/chosen": 0.6971922516822815, "rewards/margins": -0.41286224126815796, "rewards/rejected": 1.1100544929504395, "step": 3815 }, { "epoch": 2.06, "learning_rate": 5.007643243434737e-08, "logits/chosen": -2.1595816612243652, "logits/rejected": -2.162456750869751, "logps/chosen": -2.8227055072784424, "logps/rejected": -6.200497150421143, "loss": 0.32, "rewards/accuracies": 1.0, "rewards/chosen": 1.2455527782440186, "rewards/margins": 0.975256621837616, "rewards/rejected": 0.2702961564064026, "step": 3816 }, { "epoch": 2.06, "learning_rate": 5.005459460637669e-08, "logits/chosen": -2.106912612915039, "logits/rejected": -2.1066441535949707, "logps/chosen": -1.456216812133789, "logps/rejected": -1.884716272354126, "loss": 0.559, "rewards/accuracies": 1.0, "rewards/chosen": 1.1064907312393188, "rewards/margins": 0.2891055941581726, "rewards/rejected": 0.8173851370811462, "step": 3817 }, { "epoch": 2.06, "learning_rate": 5.003275676799172e-08, "logits/chosen": -2.12062668800354, "logits/rejected": -2.119554281234741, "logps/chosen": -3.665297746658325, "logps/rejected": -2.225607395172119, "loss": 0.2419, "rewards/accuracies": 1.0, "rewards/chosen": 1.896052598953247, "rewards/margins": 1.2958812713623047, "rewards/rejected": 0.6001712679862976, "step": 3818 }, { "epoch": 2.06, "learning_rate": 5.001091892335819e-08, "logits/chosen": -1.9917535781860352, "logits/rejected": -2.251316785812378, "logps/chosen": -0.9800900220870972, "logps/rejected": -0.8809318542480469, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.7398971915245056, "rewards/margins": 0.014944672584533691, "rewards/rejected": 0.7249525189399719, "step": 3819 }, { "epoch": 2.06, "learning_rate": 4.998908107664179e-08, "logits/chosen": -2.083505392074585, "logits/rejected": -2.0704891681671143, "logps/chosen": -3.817547559738159, "logps/rejected": -6.582241535186768, "loss": 0.2594, "rewards/accuracies": 1.0, "rewards/chosen": 1.487286925315857, "rewards/margins": 1.2168457508087158, "rewards/rejected": 0.27044111490249634, "step": 3820 }, { "epoch": 2.06, "learning_rate": 4.996724323200826e-08, "logits/chosen": -2.160884141921997, "logits/rejected": -2.3425843715667725, "logps/chosen": -0.7290765047073364, "logps/rejected": -0.7278666496276855, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.9534082412719727, "rewards/margins": -0.002682328224182129, "rewards/rejected": 0.9560905694961548, "step": 3821 }, { "epoch": 2.06, "learning_rate": 4.99454053936233e-08, "logits/chosen": -2.212979793548584, "logits/rejected": -2.210928440093994, "logps/chosen": -0.9226845502853394, "logps/rejected": -7.390662670135498, "loss": 0.3981, "rewards/accuracies": 1.0, "rewards/chosen": 1.0516437292099, "rewards/margins": 0.7154205441474915, "rewards/rejected": 0.33622318506240845, "step": 3822 }, { "epoch": 2.06, "learning_rate": 4.9923567565652634e-08, "logits/chosen": -2.1857616901397705, "logits/rejected": -2.183858633041382, "logps/chosen": -5.404240131378174, "logps/rejected": -6.474850177764893, "loss": 0.2523, "rewards/accuracies": 1.0, "rewards/chosen": 1.5085970163345337, "rewards/margins": 1.2481623888015747, "rewards/rejected": 0.2604345977306366, "step": 3823 }, { "epoch": 2.06, "learning_rate": 4.990172975226195e-08, "logits/chosen": -2.0988659858703613, "logits/rejected": -2.2635385990142822, "logps/chosen": -0.3644249737262726, "logps/rejected": -7.191684722900391, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": 0.8856502771377563, "rewards/margins": 0.13254255056381226, "rewards/rejected": 0.7531077265739441, "step": 3824 }, { "epoch": 2.06, "learning_rate": 4.987989195761696e-08, "logits/chosen": -2.0323920249938965, "logits/rejected": -2.287663459777832, "logps/chosen": -0.7463437914848328, "logps/rejected": -0.701971709728241, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 1.0554262399673462, "rewards/margins": 0.024457931518554688, "rewards/rejected": 1.0309683084487915, "step": 3825 }, { "epoch": 2.06, "learning_rate": 4.985805418588338e-08, "logits/chosen": -1.9788380861282349, "logits/rejected": -2.241398334503174, "logps/chosen": -1.3371310234069824, "logps/rejected": -4.163614273071289, "loss": 0.6518, "rewards/accuracies": 1.0, "rewards/chosen": 1.0373789072036743, "rewards/margins": 0.08445757627487183, "rewards/rejected": 0.9529213309288025, "step": 3826 }, { "epoch": 2.06, "learning_rate": 4.98362164412269e-08, "logits/chosen": -1.9825199842453003, "logits/rejected": -1.9824731349945068, "logps/chosen": -2.847790479660034, "logps/rejected": -2.819159746170044, "loss": 0.429, "rewards/accuracies": 1.0, "rewards/chosen": 1.338614583015442, "rewards/margins": 0.624241292476654, "rewards/rejected": 0.7143732905387878, "step": 3827 }, { "epoch": 2.06, "learning_rate": 4.9814378727813225e-08, "logits/chosen": -2.064936399459839, "logits/rejected": -2.2708210945129395, "logps/chosen": -0.26749473810195923, "logps/rejected": -0.2976479232311249, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 1.031219720840454, "rewards/margins": 0.028075098991394043, "rewards/rejected": 1.00314462184906, "step": 3828 }, { "epoch": 2.07, "learning_rate": 4.979254104980802e-08, "logits/chosen": -2.1098930835723877, "logits/rejected": -2.1107640266418457, "logps/chosen": -3.6877365112304688, "logps/rejected": -3.356492042541504, "loss": 0.4415, "rewards/accuracies": 1.0, "rewards/chosen": 1.1924927234649658, "rewards/margins": 0.5887194275856018, "rewards/rejected": 0.603773295879364, "step": 3829 }, { "epoch": 2.07, "learning_rate": 4.9770703411377e-08, "logits/chosen": -1.965718388557434, "logits/rejected": -1.9654351472854614, "logps/chosen": -0.27562230825424194, "logps/rejected": -6.870606422424316, "loss": 0.4451, "rewards/accuracies": 1.0, "rewards/chosen": 0.9707131385803223, "rewards/margins": 0.5787605047225952, "rewards/rejected": 0.39195260405540466, "step": 3830 }, { "epoch": 2.07, "learning_rate": 4.9748865816685806e-08, "logits/chosen": -2.042609930038452, "logits/rejected": -2.0416154861450195, "logps/chosen": -6.540424346923828, "logps/rejected": -6.600940704345703, "loss": 0.4248, "rewards/accuracies": 1.0, "rewards/chosen": 1.066280722618103, "rewards/margins": 0.6362533569335938, "rewards/rejected": 0.43002739548683167, "step": 3831 }, { "epoch": 2.07, "learning_rate": 4.972702826990013e-08, "logits/chosen": -1.9337257146835327, "logits/rejected": -1.93765389919281, "logps/chosen": -1.436411738395691, "logps/rejected": -3.3375725746154785, "loss": 0.4974, "rewards/accuracies": 1.0, "rewards/chosen": 1.0556020736694336, "rewards/margins": 0.43947547674179077, "rewards/rejected": 0.6161265969276428, "step": 3832 }, { "epoch": 2.07, "learning_rate": 4.9705190775185616e-08, "logits/chosen": -2.0956902503967285, "logits/rejected": -2.2204694747924805, "logps/chosen": -1.4990105628967285, "logps/rejected": -1.4827404022216797, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.6771714687347412, "rewards/margins": 0.028875768184661865, "rewards/rejected": 0.6482957005500793, "step": 3833 }, { "epoch": 2.07, "learning_rate": 4.968335333670792e-08, "logits/chosen": -2.1336328983306885, "logits/rejected": -2.1336822509765625, "logps/chosen": -0.1639404296875, "logps/rejected": -5.1226091384887695, "loss": 0.5072, "rewards/accuracies": 1.0, "rewards/chosen": 0.8463758826255798, "rewards/margins": 0.414580374956131, "rewards/rejected": 0.43179550766944885, "step": 3834 }, { "epoch": 2.07, "learning_rate": 4.966151595863267e-08, "logits/chosen": -2.111161708831787, "logits/rejected": -2.2918875217437744, "logps/chosen": -0.2825610637664795, "logps/rejected": -0.31751710176467896, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.7732081413269043, "rewards/margins": 0.036042749881744385, "rewards/rejected": 0.7371653914451599, "step": 3835 }, { "epoch": 2.07, "learning_rate": 4.963967864512548e-08, "logits/chosen": -2.04551362991333, "logits/rejected": -2.3091909885406494, "logps/chosen": -3.816403865814209, "logps/rejected": -2.4192373752593994, "loss": 0.7123, "rewards/accuracies": 0.0, "rewards/chosen": 0.8428794741630554, "rewards/margins": -0.03793776035308838, "rewards/rejected": 0.8808172345161438, "step": 3836 }, { "epoch": 2.07, "learning_rate": 4.961784140035198e-08, "logits/chosen": -2.0302860736846924, "logits/rejected": -2.0395560264587402, "logps/chosen": -1.268798589706421, "logps/rejected": -3.5585224628448486, "loss": 0.3148, "rewards/accuracies": 1.0, "rewards/chosen": 1.5418075323104858, "rewards/margins": 0.9941408634185791, "rewards/rejected": 0.5476666688919067, "step": 3837 }, { "epoch": 2.07, "learning_rate": 4.959600422847777e-08, "logits/chosen": -2.0225167274475098, "logits/rejected": -2.0286824703216553, "logps/chosen": -0.962651789188385, "logps/rejected": -3.4974288940429688, "loss": 0.4607, "rewards/accuracies": 1.0, "rewards/chosen": 1.0406169891357422, "rewards/margins": 0.5358594655990601, "rewards/rejected": 0.5047575235366821, "step": 3838 }, { "epoch": 2.07, "learning_rate": 4.957416713366842e-08, "logits/chosen": -2.0469565391540527, "logits/rejected": -2.0401182174682617, "logps/chosen": -1.7054290771484375, "logps/rejected": -6.820174217224121, "loss": 0.4831, "rewards/accuracies": 1.0, "rewards/chosen": 1.1371351480484009, "rewards/margins": 0.47615861892700195, "rewards/rejected": 0.6609765291213989, "step": 3839 }, { "epoch": 2.07, "learning_rate": 4.9552330120089524e-08, "logits/chosen": -2.002070426940918, "logits/rejected": -2.0015132427215576, "logps/chosen": -0.7557875514030457, "logps/rejected": -2.8582892417907715, "loss": 0.5236, "rewards/accuracies": 1.0, "rewards/chosen": 1.1502820253372192, "rewards/margins": 0.37377142906188965, "rewards/rejected": 0.7765105962753296, "step": 3840 }, { "epoch": 2.07, "learning_rate": 4.9530493191906606e-08, "logits/chosen": -2.1361899375915527, "logits/rejected": -1.953299880027771, "logps/chosen": -34.967281341552734, "logps/rejected": -3.039260149002075, "loss": 0.1892, "rewards/accuracies": 1.0, "rewards/chosen": 2.2264161109924316, "rewards/margins": 1.5690460205078125, "rewards/rejected": 0.6573700308799744, "step": 3841 }, { "epoch": 2.07, "learning_rate": 4.95086563532852e-08, "logits/chosen": -2.158796787261963, "logits/rejected": -2.296107292175293, "logps/chosen": -0.4113793671131134, "logps/rejected": -0.3966217339038849, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.9481293559074402, "rewards/margins": 0.02180248498916626, "rewards/rejected": 0.9263268709182739, "step": 3842 }, { "epoch": 2.07, "learning_rate": 4.9486819608390874e-08, "logits/chosen": -2.0821306705474854, "logits/rejected": -2.2695107460021973, "logps/chosen": -0.341350257396698, "logps/rejected": -0.4066639542579651, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9515308737754822, "rewards/margins": 0.016574323177337646, "rewards/rejected": 0.9349565505981445, "step": 3843 }, { "epoch": 2.07, "learning_rate": 4.946498296138909e-08, "logits/chosen": -2.0982658863067627, "logits/rejected": -2.2590577602386475, "logps/chosen": -0.41911745071411133, "logps/rejected": -2.995457172393799, "loss": 0.6252, "rewards/accuracies": 1.0, "rewards/chosen": 0.9835880398750305, "rewards/margins": 0.14088326692581177, "rewards/rejected": 0.8427047729492188, "step": 3844 }, { "epoch": 2.07, "learning_rate": 4.9443146416445365e-08, "logits/chosen": -2.068230390548706, "logits/rejected": -2.3978309631347656, "logps/chosen": -0.09258235991001129, "logps/rejected": -0.09672634303569794, "loss": 0.7019, "rewards/accuracies": 0.0, "rewards/chosen": 0.9360661506652832, "rewards/margins": -0.017486393451690674, "rewards/rejected": 0.9535525441169739, "step": 3845 }, { "epoch": 2.07, "learning_rate": 4.9421309977725154e-08, "logits/chosen": -1.943396806716919, "logits/rejected": -2.227471113204956, "logps/chosen": -2.698212146759033, "logps/rejected": -5.695672988891602, "loss": 0.6474, "rewards/accuracies": 1.0, "rewards/chosen": 0.8965717554092407, "rewards/margins": 0.09363174438476562, "rewards/rejected": 0.8029400110244751, "step": 3846 }, { "epoch": 2.07, "learning_rate": 4.9399473649393876e-08, "logits/chosen": -2.058518648147583, "logits/rejected": -2.002103805541992, "logps/chosen": -7.291609764099121, "logps/rejected": -8.266746520996094, "loss": 0.3709, "rewards/accuracies": 1.0, "rewards/chosen": 1.562569499015808, "rewards/margins": 0.8007840514183044, "rewards/rejected": 0.7617854475975037, "step": 3847 }, { "epoch": 2.08, "learning_rate": 4.937763743561698e-08, "logits/chosen": -2.1163883209228516, "logits/rejected": -2.1174182891845703, "logps/chosen": -1.0955564975738525, "logps/rejected": -3.6673693656921387, "loss": 0.4991, "rewards/accuracies": 1.0, "rewards/chosen": 0.9947723746299744, "rewards/margins": 0.4349347949028015, "rewards/rejected": 0.5598375797271729, "step": 3848 }, { "epoch": 2.08, "learning_rate": 4.935580134055986e-08, "logits/chosen": -2.075561285018921, "logits/rejected": -2.2996418476104736, "logps/chosen": -0.6188079714775085, "logps/rejected": -0.7551165223121643, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.7540385127067566, "rewards/margins": 0.035298705101013184, "rewards/rejected": 0.7187398076057434, "step": 3849 }, { "epoch": 2.08, "learning_rate": 4.9333965368387906e-08, "logits/chosen": -2.0756473541259766, "logits/rejected": -2.3105785846710205, "logps/chosen": -6.266434669494629, "logps/rejected": -6.698604106903076, "loss": 0.6633, "rewards/accuracies": 1.0, "rewards/chosen": 0.8373603224754333, "rewards/margins": 0.060685813426971436, "rewards/rejected": 0.7766745090484619, "step": 3850 }, { "epoch": 2.08, "learning_rate": 4.931212952326646e-08, "logits/chosen": -2.1478047370910645, "logits/rejected": -2.2724757194519043, "logps/chosen": -1.2054784297943115, "logps/rejected": -1.2153782844543457, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.8222447633743286, "rewards/margins": 0.030427396297454834, "rewards/rejected": 0.7918173670768738, "step": 3851 }, { "epoch": 2.08, "learning_rate": 4.929029380936086e-08, "logits/chosen": -2.053492546081543, "logits/rejected": -2.0489227771759033, "logps/chosen": -6.689857006072998, "logps/rejected": -5.268601894378662, "loss": 0.255, "rewards/accuracies": 1.0, "rewards/chosen": 1.6407623291015625, "rewards/margins": 1.23606538772583, "rewards/rejected": 0.40469691157341003, "step": 3852 }, { "epoch": 2.08, "learning_rate": 4.926845823083642e-08, "logits/chosen": -2.0682480335235596, "logits/rejected": -2.2937421798706055, "logps/chosen": -8.062705039978027, "logps/rejected": -2.1557304859161377, "loss": 0.7449, "rewards/accuracies": 0.0, "rewards/chosen": 0.9428675770759583, "rewards/margins": -0.10101443529129028, "rewards/rejected": 1.0438820123672485, "step": 3853 }, { "epoch": 2.08, "learning_rate": 4.92466227918584e-08, "logits/chosen": -2.138096809387207, "logits/rejected": -2.2847087383270264, "logps/chosen": -2.0532376766204834, "logps/rejected": -2.1491196155548096, "loss": 0.6763, "rewards/accuracies": 1.0, "rewards/chosen": 0.6979252696037292, "rewards/margins": 0.03389441967010498, "rewards/rejected": 0.6640308499336243, "step": 3854 }, { "epoch": 2.08, "learning_rate": 4.922478749659208e-08, "logits/chosen": -2.0205705165863037, "logits/rejected": -2.2335424423217773, "logps/chosen": -2.2397098541259766, "logps/rejected": -1.9060267210006714, "loss": 0.7011, "rewards/accuracies": 0.0, "rewards/chosen": 0.6743963360786438, "rewards/margins": -0.01583564281463623, "rewards/rejected": 0.69023197889328, "step": 3855 }, { "epoch": 2.08, "learning_rate": 4.920295234920268e-08, "logits/chosen": -1.9764827489852905, "logits/rejected": -2.2414557933807373, "logps/chosen": -0.26812514662742615, "logps/rejected": -0.19858166575431824, "loss": 0.6715, "rewards/accuracies": 1.0, "rewards/chosen": 0.9059091806411743, "rewards/margins": 0.04374927282333374, "rewards/rejected": 0.8621599078178406, "step": 3856 }, { "epoch": 2.08, "learning_rate": 4.9181117353855395e-08, "logits/chosen": -2.0772106647491455, "logits/rejected": -2.3170621395111084, "logps/chosen": -2.298212766647339, "logps/rejected": -7.906350135803223, "loss": 0.5738, "rewards/accuracies": 1.0, "rewards/chosen": 0.803959310054779, "rewards/margins": 0.25490403175354004, "rewards/rejected": 0.549055278301239, "step": 3857 }, { "epoch": 2.08, "learning_rate": 4.9159282514715394e-08, "logits/chosen": -2.1660056114196777, "logits/rejected": -2.0821566581726074, "logps/chosen": -34.59883117675781, "logps/rejected": -8.700855255126953, "loss": 0.245, "rewards/accuracies": 1.0, "rewards/chosen": 2.281587600708008, "rewards/margins": 1.281410574913025, "rewards/rejected": 1.000177025794983, "step": 3858 }, { "epoch": 2.08, "learning_rate": 4.913744783594782e-08, "logits/chosen": -2.1179325580596924, "logits/rejected": -2.122185468673706, "logps/chosen": -0.6922019124031067, "logps/rejected": -4.295045375823975, "loss": 0.4513, "rewards/accuracies": 1.0, "rewards/chosen": 1.031151294708252, "rewards/margins": 0.5614890456199646, "rewards/rejected": 0.46966224908828735, "step": 3859 }, { "epoch": 2.08, "learning_rate": 4.911561332171778e-08, "logits/chosen": -2.078235626220703, "logits/rejected": -2.252549886703491, "logps/chosen": -0.8982992768287659, "logps/rejected": -0.941286563873291, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.8750142455101013, "rewards/margins": 0.008380651473999023, "rewards/rejected": 0.8666335940361023, "step": 3860 }, { "epoch": 2.08, "learning_rate": 4.9093778976190356e-08, "logits/chosen": -2.0505239963531494, "logits/rejected": -2.0568230152130127, "logps/chosen": -1.6085295677185059, "logps/rejected": -2.488887310028076, "loss": 0.4504, "rewards/accuracies": 1.0, "rewards/chosen": 1.2007337808609009, "rewards/margins": 0.5640270709991455, "rewards/rejected": 0.6367067098617554, "step": 3861 }, { "epoch": 2.08, "learning_rate": 4.9071944803530585e-08, "logits/chosen": -2.0644805431365967, "logits/rejected": -2.3166284561157227, "logps/chosen": -4.4412384033203125, "logps/rejected": -4.371940612792969, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 1.149013638496399, "rewards/margins": 0.00927281379699707, "rewards/rejected": 1.1397408246994019, "step": 3862 }, { "epoch": 2.08, "learning_rate": 4.905011080790349e-08, "logits/chosen": -2.136427879333496, "logits/rejected": -2.342012643814087, "logps/chosen": -0.49870750308036804, "logps/rejected": -0.5318432450294495, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9470596313476562, "rewards/margins": 0.010277152061462402, "rewards/rejected": 0.9367824792861938, "step": 3863 }, { "epoch": 2.08, "learning_rate": 4.9028276993474054e-08, "logits/chosen": -1.9991077184677124, "logits/rejected": -2.0000417232513428, "logps/chosen": -1.421918272972107, "logps/rejected": -4.02852201461792, "loss": 0.5136, "rewards/accuracies": 1.0, "rewards/chosen": 0.9013915061950684, "rewards/margins": 0.3986455202102661, "rewards/rejected": 0.5027459859848022, "step": 3864 }, { "epoch": 2.08, "learning_rate": 4.900644336440719e-08, "logits/chosen": -2.104112148284912, "logits/rejected": -2.2938990592956543, "logps/chosen": -6.6525373458862305, "logps/rejected": -6.9568562507629395, "loss": 0.6594, "rewards/accuracies": 1.0, "rewards/chosen": 1.3812336921691895, "rewards/margins": 0.06871461868286133, "rewards/rejected": 1.3125190734863281, "step": 3865 }, { "epoch": 2.09, "learning_rate": 4.898460992486784e-08, "logits/chosen": -2.0868003368377686, "logits/rejected": -2.092832088470459, "logps/chosen": -2.0714149475097656, "logps/rejected": -4.199780464172363, "loss": 0.4081, "rewards/accuracies": 1.0, "rewards/chosen": 1.2202565670013428, "rewards/margins": 0.6853561997413635, "rewards/rejected": 0.5349003672599792, "step": 3866 }, { "epoch": 2.09, "learning_rate": 4.896277667902088e-08, "logits/chosen": -2.004103660583496, "logits/rejected": -2.272860288619995, "logps/chosen": -1.0349459648132324, "logps/rejected": -1.0146844387054443, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.901671826839447, "rewards/margins": 0.004033088684082031, "rewards/rejected": 0.897638738155365, "step": 3867 }, { "epoch": 2.09, "learning_rate": 4.894094363103113e-08, "logits/chosen": -2.0843472480773926, "logits/rejected": -2.29414701461792, "logps/chosen": -6.8487091064453125, "logps/rejected": -2.0607736110687256, "loss": 0.7564, "rewards/accuracies": 0.0, "rewards/chosen": 0.6992992758750916, "rewards/margins": -0.12273985147476196, "rewards/rejected": 0.8220391273498535, "step": 3868 }, { "epoch": 2.09, "learning_rate": 4.8919110785063386e-08, "logits/chosen": -2.2085700035095215, "logits/rejected": -2.029154062271118, "logps/chosen": -23.648162841796875, "logps/rejected": -4.365353584289551, "loss": 0.1805, "rewards/accuracies": 1.0, "rewards/chosen": 2.1902027130126953, "rewards/margins": 1.6203274726867676, "rewards/rejected": 0.5698752403259277, "step": 3869 }, { "epoch": 2.09, "learning_rate": 4.8897278145282436e-08, "logits/chosen": -2.0480074882507324, "logits/rejected": -2.2772328853607178, "logps/chosen": -0.29754382371902466, "logps/rejected": -0.3391651213169098, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9402598738670349, "rewards/margins": 0.016183674335479736, "rewards/rejected": 0.9240761995315552, "step": 3870 }, { "epoch": 2.09, "learning_rate": 4.887544571585297e-08, "logits/chosen": -2.1639797687530518, "logits/rejected": -2.1634397506713867, "logps/chosen": -0.9536874294281006, "logps/rejected": -6.709088325500488, "loss": 0.584, "rewards/accuracies": 1.0, "rewards/chosen": 1.0946857929229736, "rewards/margins": 0.23172849416732788, "rewards/rejected": 0.8629572987556458, "step": 3871 }, { "epoch": 2.09, "learning_rate": 4.8853613500939673e-08, "logits/chosen": -2.2025651931762695, "logits/rejected": -2.2082831859588623, "logps/chosen": -0.20215770602226257, "logps/rejected": -5.17643404006958, "loss": 0.4299, "rewards/accuracies": 1.0, "rewards/chosen": 0.9368336796760559, "rewards/margins": 0.621523916721344, "rewards/rejected": 0.3153097629547119, "step": 3872 }, { "epoch": 2.09, "learning_rate": 4.8831781504707185e-08, "logits/chosen": -1.9742523431777954, "logits/rejected": -2.2758264541625977, "logps/chosen": -0.525604248046875, "logps/rejected": -2.9866325855255127, "loss": 0.5768, "rewards/accuracies": 1.0, "rewards/chosen": 1.0519630908966064, "rewards/margins": 0.24801069498062134, "rewards/rejected": 0.8039523959159851, "step": 3873 }, { "epoch": 2.09, "learning_rate": 4.880994973132012e-08, "logits/chosen": -2.109492778778076, "logits/rejected": -2.074819803237915, "logps/chosen": -16.402366638183594, "logps/rejected": -3.41617488861084, "loss": 0.355, "rewards/accuracies": 1.0, "rewards/chosen": 1.5059289932250977, "rewards/margins": 0.8527303338050842, "rewards/rejected": 0.6531986594200134, "step": 3874 }, { "epoch": 2.09, "learning_rate": 4.878811818494301e-08, "logits/chosen": -2.0808732509613037, "logits/rejected": -2.3684394359588623, "logps/chosen": -0.5976296663284302, "logps/rejected": -0.6495028138160706, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 1.1561425924301147, "rewards/margins": 0.029028892517089844, "rewards/rejected": 1.127113699913025, "step": 3875 }, { "epoch": 2.09, "learning_rate": 4.876628686974037e-08, "logits/chosen": -2.157982349395752, "logits/rejected": -2.142293691635132, "logps/chosen": -9.437813758850098, "logps/rejected": -3.6913061141967773, "loss": 0.487, "rewards/accuracies": 1.0, "rewards/chosen": 1.1137956380844116, "rewards/margins": 0.466091513633728, "rewards/rejected": 0.6477041244506836, "step": 3876 }, { "epoch": 2.09, "learning_rate": 4.874445578987669e-08, "logits/chosen": -2.2129878997802734, "logits/rejected": -2.215252161026001, "logps/chosen": -2.088534355163574, "logps/rejected": -1.0889403820037842, "loss": 0.579, "rewards/accuracies": 1.0, "rewards/chosen": 1.0033625364303589, "rewards/margins": 0.24308085441589355, "rewards/rejected": 0.7602816820144653, "step": 3877 }, { "epoch": 2.09, "learning_rate": 4.872262494951638e-08, "logits/chosen": -2.182283639907837, "logits/rejected": -2.2735114097595215, "logps/chosen": -1.597264289855957, "logps/rejected": -1.5795221328735352, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.8862276077270508, "rewards/margins": -0.011569678783416748, "rewards/rejected": 0.8977972865104675, "step": 3878 }, { "epoch": 2.09, "learning_rate": 4.870079435282381e-08, "logits/chosen": -2.010798215866089, "logits/rejected": -2.011795997619629, "logps/chosen": -3.8390653133392334, "logps/rejected": -3.3086867332458496, "loss": 0.3781, "rewards/accuracies": 1.0, "rewards/chosen": 1.432352066040039, "rewards/margins": 0.777640163898468, "rewards/rejected": 0.654711902141571, "step": 3879 }, { "epoch": 2.09, "learning_rate": 4.867896400396333e-08, "logits/chosen": -2.1400644779205322, "logits/rejected": -2.2798469066619873, "logps/chosen": -0.3364148437976837, "logps/rejected": -0.37153923511505127, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.913293182849884, "rewards/margins": 0.010125458240509033, "rewards/rejected": 0.903167724609375, "step": 3880 }, { "epoch": 2.09, "learning_rate": 4.8657133907099206e-08, "logits/chosen": -2.04166579246521, "logits/rejected": -2.3027420043945312, "logps/chosen": -0.35993555188179016, "logps/rejected": -0.3776376247406006, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.8207191824913025, "rewards/margins": 0.0314677357673645, "rewards/rejected": 0.789251446723938, "step": 3881 }, { "epoch": 2.09, "learning_rate": 4.863530406639568e-08, "logits/chosen": -2.0223662853240967, "logits/rejected": -2.2993555068969727, "logps/chosen": -0.27618902921676636, "logps/rejected": -0.29336726665496826, "loss": 0.6999, "rewards/accuracies": 0.0, "rewards/chosen": 0.8806368112564087, "rewards/margins": -0.013365566730499268, "rewards/rejected": 0.894002377986908, "step": 3882 }, { "epoch": 2.09, "learning_rate": 4.861347448601693e-08, "logits/chosen": -2.055302858352661, "logits/rejected": -2.29152512550354, "logps/chosen": -0.392278254032135, "logps/rejected": -0.41386693716049194, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.9922504425048828, "rewards/margins": 0.020668864250183105, "rewards/rejected": 0.9715815782546997, "step": 3883 }, { "epoch": 2.09, "learning_rate": 4.85916451701271e-08, "logits/chosen": -2.1117091178894043, "logits/rejected": -2.1146140098571777, "logps/chosen": -0.6223200559616089, "logps/rejected": -2.100839614868164, "loss": 0.5517, "rewards/accuracies": 1.0, "rewards/chosen": 0.9466350674629211, "rewards/margins": 0.3062382936477661, "rewards/rejected": 0.640396773815155, "step": 3884 }, { "epoch": 2.1, "learning_rate": 4.8569816122890285e-08, "logits/chosen": -2.06538987159729, "logits/rejected": -2.3533520698547363, "logps/chosen": -0.21565482020378113, "logps/rejected": -0.5059945583343506, "loss": 0.6652, "rewards/accuracies": 1.0, "rewards/chosen": 0.8323907256126404, "rewards/margins": 0.056736767292022705, "rewards/rejected": 0.7756539583206177, "step": 3885 }, { "epoch": 2.1, "learning_rate": 4.854798734847051e-08, "logits/chosen": -2.146327495574951, "logits/rejected": -2.1530182361602783, "logps/chosen": -0.8426843881607056, "logps/rejected": -2.103478193283081, "loss": 0.5064, "rewards/accuracies": 1.0, "rewards/chosen": 0.9591798186302185, "rewards/margins": 0.4165838360786438, "rewards/rejected": 0.5425959825515747, "step": 3886 }, { "epoch": 2.1, "learning_rate": 4.852615885103175e-08, "logits/chosen": -2.19657301902771, "logits/rejected": -2.3148629665374756, "logps/chosen": -1.4007220268249512, "logps/rejected": -1.4403902292251587, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.9845840334892273, "rewards/margins": 0.017629504203796387, "rewards/rejected": 0.9669545292854309, "step": 3887 }, { "epoch": 2.1, "learning_rate": 4.8504330634737944e-08, "logits/chosen": -2.131401777267456, "logits/rejected": -2.3233938217163086, "logps/chosen": -0.8260259628295898, "logps/rejected": -0.83091801404953, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 1.0104814767837524, "rewards/margins": -0.0074307918548583984, "rewards/rejected": 1.0179122686386108, "step": 3888 }, { "epoch": 2.1, "learning_rate": 4.848250270375298e-08, "logits/chosen": -2.0669469833374023, "logits/rejected": -2.0745790004730225, "logps/chosen": -0.3402498960494995, "logps/rejected": -6.853740692138672, "loss": 0.4002, "rewards/accuracies": 1.0, "rewards/chosen": 0.9885539412498474, "rewards/margins": 0.7090005278587341, "rewards/rejected": 0.2795534133911133, "step": 3889 }, { "epoch": 2.1, "learning_rate": 4.8460675062240674e-08, "logits/chosen": -2.136329412460327, "logits/rejected": -2.1338138580322266, "logps/chosen": -9.243010520935059, "logps/rejected": -1.6674100160598755, "loss": 0.515, "rewards/accuracies": 1.0, "rewards/chosen": 1.3129315376281738, "rewards/margins": 0.3951457142829895, "rewards/rejected": 0.9177858233451843, "step": 3890 }, { "epoch": 2.1, "learning_rate": 4.8438847714364786e-08, "logits/chosen": -2.0739521980285645, "logits/rejected": -2.3456313610076904, "logps/chosen": -1.5568050146102905, "logps/rejected": -14.23032283782959, "loss": 0.5089, "rewards/accuracies": 1.0, "rewards/chosen": 1.0002074241638184, "rewards/margins": 0.41021019220352173, "rewards/rejected": 0.5899972319602966, "step": 3891 }, { "epoch": 2.1, "learning_rate": 4.841702066428903e-08, "logits/chosen": -2.0598981380462646, "logits/rejected": -2.0675148963928223, "logps/chosen": -4.921128273010254, "logps/rejected": -3.2348837852478027, "loss": 0.4534, "rewards/accuracies": 1.0, "rewards/chosen": 1.1132407188415527, "rewards/margins": 0.5557814240455627, "rewards/rejected": 0.55745929479599, "step": 3892 }, { "epoch": 2.1, "learning_rate": 4.839519391617708e-08, "logits/chosen": -2.143479824066162, "logits/rejected": -2.2951924800872803, "logps/chosen": -0.3468540906906128, "logps/rejected": -0.34417158365249634, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.8796368837356567, "rewards/margins": 0.025071144104003906, "rewards/rejected": 0.8545657396316528, "step": 3893 }, { "epoch": 2.1, "learning_rate": 4.837336747419249e-08, "logits/chosen": -2.028998613357544, "logits/rejected": -2.0233402252197266, "logps/chosen": -5.205571174621582, "logps/rejected": -2.857357978820801, "loss": 0.4072, "rewards/accuracies": 1.0, "rewards/chosen": 1.3716130256652832, "rewards/margins": 0.6880584359169006, "rewards/rejected": 0.6835545897483826, "step": 3894 }, { "epoch": 2.1, "learning_rate": 4.835154134249882e-08, "logits/chosen": -2.1149890422821045, "logits/rejected": -2.2849199771881104, "logps/chosen": -0.607439398765564, "logps/rejected": -0.636172890663147, "loss": 0.7044, "rewards/accuracies": 0.0, "rewards/chosen": 0.9266619682312012, "rewards/margins": -0.022338569164276123, "rewards/rejected": 0.9490005373954773, "step": 3895 }, { "epoch": 2.1, "learning_rate": 4.8329715525259565e-08, "logits/chosen": -2.208974838256836, "logits/rejected": -2.1237900257110596, "logps/chosen": -30.16231346130371, "logps/rejected": -2.4068562984466553, "loss": 0.1875, "rewards/accuracies": 1.0, "rewards/chosen": 2.3172731399536133, "rewards/margins": 1.57899808883667, "rewards/rejected": 0.7382749915122986, "step": 3896 }, { "epoch": 2.1, "learning_rate": 4.8307890026638124e-08, "logits/chosen": -2.103517770767212, "logits/rejected": -2.286461114883423, "logps/chosen": -0.47755569219589233, "logps/rejected": -0.5095953345298767, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9324180483818054, "rewards/margins": 0.018464267253875732, "rewards/rejected": 0.9139537811279297, "step": 3897 }, { "epoch": 2.1, "learning_rate": 4.8286064850797863e-08, "logits/chosen": -2.038691282272339, "logits/rejected": -2.275820255279541, "logps/chosen": -0.8158649802207947, "logps/rejected": -0.7249857187271118, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 1.0011285543441772, "rewards/margins": 0.0029796957969665527, "rewards/rejected": 0.9981488585472107, "step": 3898 }, { "epoch": 2.1, "learning_rate": 4.8264240001902064e-08, "logits/chosen": -1.9456654787063599, "logits/rejected": -1.9613038301467896, "logps/chosen": -2.8423163890838623, "logps/rejected": -5.154191017150879, "loss": 0.5814, "rewards/accuracies": 1.0, "rewards/chosen": 1.0852397680282593, "rewards/margins": 0.23762553930282593, "rewards/rejected": 0.8476142287254333, "step": 3899 }, { "epoch": 2.1, "learning_rate": 4.8242415484113984e-08, "logits/chosen": -2.1233139038085938, "logits/rejected": -2.123830795288086, "logps/chosen": -1.7565975189208984, "logps/rejected": -1.6935360431671143, "loss": 0.4616, "rewards/accuracies": 1.0, "rewards/chosen": 1.2555793523788452, "rewards/margins": 0.5333701968193054, "rewards/rejected": 0.7222091555595398, "step": 3900 }, { "epoch": 2.1, "learning_rate": 4.822059130159679e-08, "logits/chosen": -2.1452457904815674, "logits/rejected": -2.273836851119995, "logps/chosen": -0.7419697642326355, "logps/rejected": -0.7511414289474487, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.9440955519676208, "rewards/margins": 0.004988670349121094, "rewards/rejected": 0.9391068816184998, "step": 3901 }, { "epoch": 2.1, "learning_rate": 4.819876745851358e-08, "logits/chosen": -2.050431251525879, "logits/rejected": -2.2158005237579346, "logps/chosen": -0.4161944091320038, "logps/rejected": -0.4575207531452179, "loss": 0.6688, "rewards/accuracies": 1.0, "rewards/chosen": 0.8610197305679321, "rewards/margins": 0.04924023151397705, "rewards/rejected": 0.8117794990539551, "step": 3902 }, { "epoch": 2.11, "learning_rate": 4.8176943959027406e-08, "logits/chosen": -2.06589937210083, "logits/rejected": -2.3243637084960938, "logps/chosen": -2.8272202014923096, "logps/rejected": -3.345982551574707, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.7950881123542786, "rewards/margins": 0.018649280071258545, "rewards/rejected": 0.77643883228302, "step": 3903 }, { "epoch": 2.11, "learning_rate": 4.815512080730124e-08, "logits/chosen": -2.149820327758789, "logits/rejected": -2.1494946479797363, "logps/chosen": -0.7219605445861816, "logps/rejected": -1.8585710525512695, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.9302499890327454, "rewards/margins": 0.014133572578430176, "rewards/rejected": 0.9161164164543152, "step": 3904 }, { "epoch": 2.11, "learning_rate": 4.813329800749799e-08, "logits/chosen": -2.1056740283966064, "logits/rejected": -2.2719550132751465, "logps/chosen": -8.875268936157227, "logps/rejected": -0.8389021754264832, "loss": 0.7159, "rewards/accuracies": 0.0, "rewards/chosen": 0.8860090374946594, "rewards/margins": -0.04493892192840576, "rewards/rejected": 0.9309479594230652, "step": 3905 }, { "epoch": 2.11, "learning_rate": 4.81114755637805e-08, "logits/chosen": -2.1423351764678955, "logits/rejected": -2.1544249057769775, "logps/chosen": -4.651744842529297, "logps/rejected": -3.7078678607940674, "loss": 0.4045, "rewards/accuracies": 1.0, "rewards/chosen": 1.2894134521484375, "rewards/margins": 0.6961736679077148, "rewards/rejected": 0.5932397842407227, "step": 3906 }, { "epoch": 2.11, "learning_rate": 4.808965348031155e-08, "logits/chosen": -2.1757190227508545, "logits/rejected": -2.1530110836029053, "logps/chosen": -10.015724182128906, "logps/rejected": -7.556789875030518, "loss": 0.2845, "rewards/accuracies": 1.0, "rewards/chosen": 1.6794430017471313, "rewards/margins": 1.1113672256469727, "rewards/rejected": 0.5680757164955139, "step": 3907 }, { "epoch": 2.11, "learning_rate": 4.806783176125385e-08, "logits/chosen": -1.969752311706543, "logits/rejected": -1.9680222272872925, "logps/chosen": -0.49133697152137756, "logps/rejected": -2.4489364624023438, "loss": 0.6035, "rewards/accuracies": 1.0, "rewards/chosen": 0.9833084344863892, "rewards/margins": 0.1881445050239563, "rewards/rejected": 0.7951639294624329, "step": 3908 }, { "epoch": 2.11, "learning_rate": 4.804601041077002e-08, "logits/chosen": -2.1395883560180664, "logits/rejected": -2.1333296298980713, "logps/chosen": -6.203968048095703, "logps/rejected": -6.457589149475098, "loss": 0.3522, "rewards/accuracies": 1.0, "rewards/chosen": 1.1669797897338867, "rewards/margins": 0.8622185587882996, "rewards/rejected": 0.30476123094558716, "step": 3909 }, { "epoch": 2.11, "learning_rate": 4.802418943302263e-08, "logits/chosen": -2.0476300716400146, "logits/rejected": -2.0598819255828857, "logps/chosen": -3.447972536087036, "logps/rejected": -2.6534783840179443, "loss": 0.4753, "rewards/accuracies": 1.0, "rewards/chosen": 1.0593441724777222, "rewards/margins": 0.4968600869178772, "rewards/rejected": 0.562484085559845, "step": 3910 }, { "epoch": 2.11, "learning_rate": 4.80023688321742e-08, "logits/chosen": -2.0953593254089355, "logits/rejected": -2.0922560691833496, "logps/chosen": -2.7280638217926025, "logps/rejected": -6.679497241973877, "loss": 0.2672, "rewards/accuracies": 1.0, "rewards/chosen": 1.5033758878707886, "rewards/margins": 1.1830966472625732, "rewards/rejected": 0.32027918100357056, "step": 3911 }, { "epoch": 2.11, "learning_rate": 4.7980548612387145e-08, "logits/chosen": -2.201134204864502, "logits/rejected": -2.207012891769409, "logps/chosen": -3.2237191200256348, "logps/rejected": -6.489983081817627, "loss": 0.4304, "rewards/accuracies": 1.0, "rewards/chosen": 0.7996225953102112, "rewards/margins": 0.6200360059738159, "rewards/rejected": 0.17958655953407288, "step": 3912 }, { "epoch": 2.11, "learning_rate": 4.79587287778238e-08, "logits/chosen": -2.1083836555480957, "logits/rejected": -2.276817560195923, "logps/chosen": -0.44742798805236816, "logps/rejected": -4.849730968475342, "loss": 0.7184, "rewards/accuracies": 0.0, "rewards/chosen": 0.7629179954528809, "rewards/margins": -0.04986107349395752, "rewards/rejected": 0.8127790689468384, "step": 3913 }, { "epoch": 2.11, "learning_rate": 4.7936909332646457e-08, "logits/chosen": -2.2004687786102295, "logits/rejected": -2.184312105178833, "logps/chosen": -3.037724018096924, "logps/rejected": -10.016036987304688, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 1.2225494384765625, "rewards/margins": 0.8526062965393066, "rewards/rejected": 0.36994314193725586, "step": 3914 }, { "epoch": 2.11, "learning_rate": 4.791509028101732e-08, "logits/chosen": -2.1710758209228516, "logits/rejected": -2.3273260593414307, "logps/chosen": -1.2557640075683594, "logps/rejected": -1.2061196565628052, "loss": 0.7039, "rewards/accuracies": 0.0, "rewards/chosen": 0.8739627003669739, "rewards/margins": -0.021310031414031982, "rewards/rejected": 0.8952727317810059, "step": 3915 }, { "epoch": 2.11, "learning_rate": 4.789327162709851e-08, "logits/chosen": -2.084260940551758, "logits/rejected": -2.2347066402435303, "logps/chosen": -0.1355193704366684, "logps/rejected": -0.11577398329973221, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.674250066280365, "rewards/margins": 0.024250030517578125, "rewards/rejected": 0.6500000357627869, "step": 3916 }, { "epoch": 2.11, "learning_rate": 4.787145337505209e-08, "logits/chosen": -2.0652976036071777, "logits/rejected": -2.068129301071167, "logps/chosen": -0.34394147992134094, "logps/rejected": -4.966452598571777, "loss": 0.4999, "rewards/accuracies": 1.0, "rewards/chosen": 0.8523387312889099, "rewards/margins": 0.43293270468711853, "rewards/rejected": 0.4194060266017914, "step": 3917 }, { "epoch": 2.11, "learning_rate": 4.7849635529040025e-08, "logits/chosen": -2.1610028743743896, "logits/rejected": -2.2883856296539307, "logps/chosen": -0.2153031826019287, "logps/rejected": -0.22579649090766907, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.8650205731391907, "rewards/margins": 0.008466720581054688, "rewards/rejected": 0.856553852558136, "step": 3918 }, { "epoch": 2.11, "learning_rate": 4.7827818093224206e-08, "logits/chosen": -2.140821933746338, "logits/rejected": -2.1320180892944336, "logps/chosen": -0.4042280912399292, "logps/rejected": -7.882798671722412, "loss": 0.4096, "rewards/accuracies": 1.0, "rewards/chosen": 1.1705979108810425, "rewards/margins": 0.680730938911438, "rewards/rejected": 0.4898669719696045, "step": 3919 }, { "epoch": 2.11, "learning_rate": 4.7806001071766476e-08, "logits/chosen": -2.0692431926727295, "logits/rejected": -2.4247682094573975, "logps/chosen": -5.771152019500732, "logps/rejected": -18.467945098876953, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.7828428745269775, "rewards/margins": -0.012191951274871826, "rewards/rejected": 0.7950348258018494, "step": 3920 }, { "epoch": 2.11, "learning_rate": 4.778418446882854e-08, "logits/chosen": -2.1014723777770996, "logits/rejected": -2.1015031337738037, "logps/chosen": -0.6128989458084106, "logps/rejected": -10.39078426361084, "loss": 0.5022, "rewards/accuracies": 1.0, "rewards/chosen": 1.105406641960144, "rewards/margins": 0.4272008538246155, "rewards/rejected": 0.6782057881355286, "step": 3921 }, { "epoch": 2.12, "learning_rate": 4.776236828857211e-08, "logits/chosen": -2.094275951385498, "logits/rejected": -2.10567045211792, "logps/chosen": -3.3549141883850098, "logps/rejected": -3.2526111602783203, "loss": 0.3031, "rewards/accuracies": 1.0, "rewards/chosen": 1.5205354690551758, "rewards/margins": 1.0381618738174438, "rewards/rejected": 0.4823736250400543, "step": 3922 }, { "epoch": 2.12, "learning_rate": 4.7740552535158745e-08, "logits/chosen": -1.9099557399749756, "logits/rejected": -1.9199433326721191, "logps/chosen": -2.794898271560669, "logps/rejected": -5.866766452789307, "loss": 0.3816, "rewards/accuracies": 1.0, "rewards/chosen": 1.193914771080017, "rewards/margins": 0.7666645050048828, "rewards/rejected": 0.4272502362728119, "step": 3923 }, { "epoch": 2.12, "learning_rate": 4.771873721274994e-08, "logits/chosen": -2.1660311222076416, "logits/rejected": -2.200845718383789, "logps/chosen": -0.6701298952102661, "logps/rejected": -7.559679985046387, "loss": 0.7039, "rewards/accuracies": 0.0, "rewards/chosen": 1.0083731412887573, "rewards/margins": -0.02145826816558838, "rewards/rejected": 1.0298314094543457, "step": 3924 }, { "epoch": 2.12, "learning_rate": 4.769692232550711e-08, "logits/chosen": -2.075843334197998, "logits/rejected": -2.073345184326172, "logps/chosen": -6.389111042022705, "logps/rejected": -1.2036950588226318, "loss": 0.3314, "rewards/accuracies": 1.0, "rewards/chosen": 1.7715286016464233, "rewards/margins": 0.9342821836471558, "rewards/rejected": 0.8372464179992676, "step": 3925 }, { "epoch": 2.12, "learning_rate": 4.767510787759158e-08, "logits/chosen": -2.087090492248535, "logits/rejected": -2.0901527404785156, "logps/chosen": -1.194951057434082, "logps/rejected": -11.177116394042969, "loss": 0.4287, "rewards/accuracies": 1.0, "rewards/chosen": 1.1234228610992432, "rewards/margins": 0.6249483823776245, "rewards/rejected": 0.49847450852394104, "step": 3926 }, { "epoch": 2.12, "learning_rate": 4.765329387316462e-08, "logits/chosen": -2.059079170227051, "logits/rejected": -2.053889751434326, "logps/chosen": -5.698726654052734, "logps/rejected": -6.171801567077637, "loss": 0.3756, "rewards/accuracies": 1.0, "rewards/chosen": 1.4069709777832031, "rewards/margins": 0.7854185700416565, "rewards/rejected": 0.6215524077415466, "step": 3927 }, { "epoch": 2.12, "learning_rate": 4.763148031638739e-08, "logits/chosen": -2.0643551349639893, "logits/rejected": -2.06889271736145, "logps/chosen": -3.7594668865203857, "logps/rejected": -4.759261131286621, "loss": 0.3647, "rewards/accuracies": 1.0, "rewards/chosen": 1.274466872215271, "rewards/margins": 0.8207905292510986, "rewards/rejected": 0.45367631316185, "step": 3928 }, { "epoch": 2.12, "learning_rate": 4.760966721142097e-08, "logits/chosen": -1.9662327766418457, "logits/rejected": -1.965777039527893, "logps/chosen": -1.14727783203125, "logps/rejected": -0.8122490048408508, "loss": 0.6166, "rewards/accuracies": 1.0, "rewards/chosen": 0.9374400973320007, "rewards/margins": 0.15947109460830688, "rewards/rejected": 0.7779690027236938, "step": 3929 }, { "epoch": 2.12, "learning_rate": 4.758785456242635e-08, "logits/chosen": -2.1844165325164795, "logits/rejected": -2.1905460357666016, "logps/chosen": -0.3828530013561249, "logps/rejected": -7.1658525466918945, "loss": 0.3584, "rewards/accuracies": 1.0, "rewards/chosen": 1.04412841796875, "rewards/margins": 0.8417022824287415, "rewards/rejected": 0.20242615044116974, "step": 3930 }, { "epoch": 2.12, "learning_rate": 4.7566042373564443e-08, "logits/chosen": -2.119131565093994, "logits/rejected": -2.3114583492279053, "logps/chosen": -4.301393985748291, "logps/rejected": -3.074164390563965, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.6616004705429077, "rewards/margins": 0.03462481498718262, "rewards/rejected": 0.6269756555557251, "step": 3931 }, { "epoch": 2.12, "learning_rate": 4.754423064899605e-08, "logits/chosen": -2.0720651149749756, "logits/rejected": -2.0633740425109863, "logps/chosen": -3.900970220565796, "logps/rejected": -2.694159984588623, "loss": 0.362, "rewards/accuracies": 1.0, "rewards/chosen": 1.6506931781768799, "rewards/margins": 0.8297854065895081, "rewards/rejected": 0.8209077715873718, "step": 3932 }, { "epoch": 2.12, "learning_rate": 4.752241939288193e-08, "logits/chosen": -2.060075283050537, "logits/rejected": -2.23110032081604, "logps/chosen": -0.7187456488609314, "logps/rejected": -0.6627272963523865, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.7242191433906555, "rewards/margins": 0.007159411907196045, "rewards/rejected": 0.7170597314834595, "step": 3933 }, { "epoch": 2.12, "learning_rate": 4.750060860938271e-08, "logits/chosen": -2.0558993816375732, "logits/rejected": -2.0594394207000732, "logps/chosen": -9.822160720825195, "logps/rejected": -4.334898948669434, "loss": 0.6107, "rewards/accuracies": 1.0, "rewards/chosen": 0.9772127270698547, "rewards/margins": 0.17222851514816284, "rewards/rejected": 0.8049842119216919, "step": 3934 }, { "epoch": 2.12, "learning_rate": 4.747879830265894e-08, "logits/chosen": -2.0607268810272217, "logits/rejected": -2.062136650085449, "logps/chosen": -4.491816997528076, "logps/rejected": -3.2423388957977295, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 2.4434564113616943, "rewards/margins": 1.948596715927124, "rewards/rejected": 0.4948596954345703, "step": 3935 }, { "epoch": 2.12, "learning_rate": 4.745698847687109e-08, "logits/chosen": -2.0632827281951904, "logits/rejected": -2.0662660598754883, "logps/chosen": -0.9996715784072876, "logps/rejected": -6.70522403717041, "loss": 0.3774, "rewards/accuracies": 1.0, "rewards/chosen": 1.045972466468811, "rewards/margins": 0.7797979116439819, "rewards/rejected": 0.2661745250225067, "step": 3936 }, { "epoch": 2.12, "learning_rate": 4.743517913617952e-08, "logits/chosen": -2.0318994522094727, "logits/rejected": -2.0310943126678467, "logps/chosen": -2.219459295272827, "logps/rejected": -6.06048059463501, "loss": 0.4223, "rewards/accuracies": 1.0, "rewards/chosen": 1.1324028968811035, "rewards/margins": 0.6434003114700317, "rewards/rejected": 0.4890025556087494, "step": 3937 }, { "epoch": 2.12, "learning_rate": 4.7413370284744506e-08, "logits/chosen": -2.1877825260162354, "logits/rejected": -2.300145387649536, "logps/chosen": -6.14132022857666, "logps/rejected": -0.7857608795166016, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 1.111987829208374, "rewards/margins": 0.017245888710021973, "rewards/rejected": 1.094741940498352, "step": 3938 }, { "epoch": 2.12, "learning_rate": 4.7391561926726235e-08, "logits/chosen": -2.058115243911743, "logits/rejected": -2.269338607788086, "logps/chosen": -1.3487660884857178, "logps/rejected": -1.2272928953170776, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 1.0565022230148315, "rewards/margins": 0.030122876167297363, "rewards/rejected": 1.0263793468475342, "step": 3939 }, { "epoch": 2.13, "learning_rate": 4.736975406628481e-08, "logits/chosen": -2.131538152694702, "logits/rejected": -2.1339919567108154, "logps/chosen": -0.1734507828950882, "logps/rejected": -6.195314884185791, "loss": 0.375, "rewards/accuracies": 1.0, "rewards/chosen": 1.0623830556869507, "rewards/margins": 0.7874256372451782, "rewards/rejected": 0.27495741844177246, "step": 3940 }, { "epoch": 2.13, "learning_rate": 4.734794670758018e-08, "logits/chosen": -2.1688270568847656, "logits/rejected": -2.1587603092193604, "logps/chosen": -7.749030590057373, "logps/rejected": -0.8204509615898132, "loss": 0.4689, "rewards/accuracies": 1.0, "rewards/chosen": 1.4757694005966187, "rewards/margins": 0.513895571231842, "rewards/rejected": 0.9618738293647766, "step": 3941 }, { "epoch": 2.13, "learning_rate": 4.732613985477229e-08, "logits/chosen": -2.059061050415039, "logits/rejected": -2.274155616760254, "logps/chosen": -1.1511828899383545, "logps/rejected": -1.16740083694458, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": 0.8423062562942505, "rewards/margins": -0.018452167510986328, "rewards/rejected": 0.8607584238052368, "step": 3942 }, { "epoch": 2.13, "learning_rate": 4.730433351202089e-08, "logits/chosen": -2.2085936069488525, "logits/rejected": -2.2073957920074463, "logps/chosen": -2.8345162868499756, "logps/rejected": -6.069241523742676, "loss": 0.3038, "rewards/accuracies": 1.0, "rewards/chosen": 1.3527551889419556, "rewards/margins": 1.0356628894805908, "rewards/rejected": 0.31709223985671997, "step": 3943 }, { "epoch": 2.13, "learning_rate": 4.728252768348574e-08, "logits/chosen": -2.0176680088043213, "logits/rejected": -2.249474287033081, "logps/chosen": -1.700777530670166, "logps/rejected": -0.6979290843009949, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 1.0580434799194336, "rewards/margins": 0.009820818901062012, "rewards/rejected": 1.0482226610183716, "step": 3944 }, { "epoch": 2.13, "learning_rate": 4.726072237332643e-08, "logits/chosen": -2.0486018657684326, "logits/rejected": -2.043729543685913, "logps/chosen": -8.445764541625977, "logps/rejected": -1.968707799911499, "loss": 0.3505, "rewards/accuracies": 1.0, "rewards/chosen": 1.5385046005249023, "rewards/margins": 0.8680394887924194, "rewards/rejected": 0.6704651117324829, "step": 3945 }, { "epoch": 2.13, "learning_rate": 4.7238917585702465e-08, "logits/chosen": -2.1663119792938232, "logits/rejected": -2.1664810180664062, "logps/chosen": -2.8446178436279297, "logps/rejected": -4.4962077140808105, "loss": 0.2542, "rewards/accuracies": 1.0, "rewards/chosen": 1.7048639059066772, "rewards/margins": 1.2397881746292114, "rewards/rejected": 0.4650757312774658, "step": 3946 }, { "epoch": 2.13, "learning_rate": 4.721711332477324e-08, "logits/chosen": -2.084547281265259, "logits/rejected": -1.9834239482879639, "logps/chosen": -21.95370101928711, "logps/rejected": -2.763749361038208, "loss": 0.141, "rewards/accuracies": 1.0, "rewards/chosen": 2.365563154220581, "rewards/margins": 1.8872902393341064, "rewards/rejected": 0.4782729148864746, "step": 3947 }, { "epoch": 2.13, "learning_rate": 4.719530959469807e-08, "logits/chosen": -2.184953212738037, "logits/rejected": -2.375373363494873, "logps/chosen": -8.472773551940918, "logps/rejected": -12.201708793640137, "loss": 0.7433, "rewards/accuracies": 0.0, "rewards/chosen": 1.162577748298645, "rewards/margins": -0.09785783290863037, "rewards/rejected": 1.2604355812072754, "step": 3948 }, { "epoch": 2.13, "learning_rate": 4.717350639963616e-08, "logits/chosen": -2.235161304473877, "logits/rejected": -2.083158254623413, "logps/chosen": -30.552581787109375, "logps/rejected": -11.442296981811523, "loss": 0.1458, "rewards/accuracies": 1.0, "rewards/chosen": 2.3395283222198486, "rewards/margins": 1.8519072532653809, "rewards/rejected": 0.48762112855911255, "step": 3949 }, { "epoch": 2.13, "learning_rate": 4.7151703743746615e-08, "logits/chosen": -2.140247106552124, "logits/rejected": -2.148836374282837, "logps/chosen": -4.294767379760742, "logps/rejected": -4.95324182510376, "loss": 0.3903, "rewards/accuracies": 1.0, "rewards/chosen": 1.2398499250411987, "rewards/margins": 0.7393304705619812, "rewards/rejected": 0.5005194544792175, "step": 3950 }, { "epoch": 2.13, "learning_rate": 4.7129901631188436e-08, "logits/chosen": -2.114927291870117, "logits/rejected": -2.0917694568634033, "logps/chosen": -15.314740180969238, "logps/rejected": -3.602818012237549, "loss": 0.1943, "rewards/accuracies": 1.0, "rewards/chosen": 2.278282642364502, "rewards/margins": 1.5396020412445068, "rewards/rejected": 0.7386806011199951, "step": 3951 }, { "epoch": 2.13, "learning_rate": 4.7108100066120516e-08, "logits/chosen": -2.024172306060791, "logits/rejected": -2.025953531265259, "logps/chosen": -0.8738425970077515, "logps/rejected": -3.8534185886383057, "loss": 0.5149, "rewards/accuracies": 1.0, "rewards/chosen": 1.1361782550811768, "rewards/margins": 0.395216703414917, "rewards/rejected": 0.7409615516662598, "step": 3952 }, { "epoch": 2.13, "learning_rate": 4.708629905270166e-08, "logits/chosen": -2.061093807220459, "logits/rejected": -2.061068534851074, "logps/chosen": -0.17452627420425415, "logps/rejected": -5.822474956512451, "loss": 0.4018, "rewards/accuracies": 1.0, "rewards/chosen": 0.9974501729011536, "rewards/margins": 0.704126238822937, "rewards/rejected": 0.29332396388053894, "step": 3953 }, { "epoch": 2.13, "learning_rate": 4.706449859509054e-08, "logits/chosen": -2.0952072143554688, "logits/rejected": -2.233351230621338, "logps/chosen": -2.6318070888519287, "logps/rejected": -2.74826979637146, "loss": 0.6666, "rewards/accuracies": 1.0, "rewards/chosen": 0.7016043066978455, "rewards/margins": 0.053746163845062256, "rewards/rejected": 0.6478581428527832, "step": 3954 }, { "epoch": 2.13, "learning_rate": 4.704269869744574e-08, "logits/chosen": -2.115938663482666, "logits/rejected": -2.2035088539123535, "logps/chosen": -2.188626766204834, "logps/rejected": -2.8700826168060303, "loss": 0.6217, "rewards/accuracies": 1.0, "rewards/chosen": 0.9524780511856079, "rewards/margins": 0.1483139991760254, "rewards/rejected": 0.8041640520095825, "step": 3955 }, { "epoch": 2.13, "learning_rate": 4.702089936392574e-08, "logits/chosen": -2.1477348804473877, "logits/rejected": -2.14251708984375, "logps/chosen": -2.4778952598571777, "logps/rejected": -4.51234245300293, "loss": 0.481, "rewards/accuracies": 1.0, "rewards/chosen": 1.091596007347107, "rewards/margins": 0.4816654920578003, "rewards/rejected": 0.6099305152893066, "step": 3956 }, { "epoch": 2.13, "learning_rate": 4.699910059868892e-08, "logits/chosen": -2.204866409301758, "logits/rejected": -2.2370290756225586, "logps/chosen": -8.446218490600586, "logps/rejected": -9.622739791870117, "loss": 0.631, "rewards/accuracies": 1.0, "rewards/chosen": 1.2231825590133667, "rewards/margins": 0.12850570678710938, "rewards/rejected": 1.0946768522262573, "step": 3957 }, { "epoch": 2.13, "learning_rate": 4.697730240589352e-08, "logits/chosen": -2.16202974319458, "logits/rejected": -2.2911980152130127, "logps/chosen": -4.348359107971191, "logps/rejected": -26.903255462646484, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 1.2866466045379639, "rewards/margins": 0.8525237441062927, "rewards/rejected": 0.43412286043167114, "step": 3958 }, { "epoch": 2.14, "learning_rate": 4.695550478969769e-08, "logits/chosen": -2.080284357070923, "logits/rejected": -2.2670860290527344, "logps/chosen": -0.3921983540058136, "logps/rejected": -0.412739634513855, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.8549901843070984, "rewards/margins": 0.024507761001586914, "rewards/rejected": 0.8304824233055115, "step": 3959 }, { "epoch": 2.14, "learning_rate": 4.6933707754259484e-08, "logits/chosen": -2.1501498222351074, "logits/rejected": -2.352362871170044, "logps/chosen": -1.5927834510803223, "logps/rejected": -6.723196983337402, "loss": 0.6498, "rewards/accuracies": 1.0, "rewards/chosen": 1.224058747291565, "rewards/margins": 0.08862662315368652, "rewards/rejected": 1.1354321241378784, "step": 3960 }, { "epoch": 2.14, "learning_rate": 4.6911911303736815e-08, "logits/chosen": -2.077704906463623, "logits/rejected": -2.078277349472046, "logps/chosen": -4.271366596221924, "logps/rejected": -2.090895652770996, "loss": 0.2808, "rewards/accuracies": 1.0, "rewards/chosen": 1.7421636581420898, "rewards/margins": 1.1265451908111572, "rewards/rejected": 0.6156184077262878, "step": 3961 }, { "epoch": 2.14, "learning_rate": 4.6890115442287516e-08, "logits/chosen": -2.048250675201416, "logits/rejected": -2.0492773056030273, "logps/chosen": -3.893165349960327, "logps/rejected": -3.97399640083313, "loss": 0.2949, "rewards/accuracies": 1.0, "rewards/chosen": 1.642604112625122, "rewards/margins": 1.0702025890350342, "rewards/rejected": 0.5724014639854431, "step": 3962 }, { "epoch": 2.14, "learning_rate": 4.686832017406929e-08, "logits/chosen": -2.1358237266540527, "logits/rejected": -2.1350300312042236, "logps/chosen": -0.9275739789009094, "logps/rejected": -1.1638901233673096, "loss": 0.6574, "rewards/accuracies": 1.0, "rewards/chosen": 0.8958830833435059, "rewards/margins": 0.07290655374526978, "rewards/rejected": 0.8229765295982361, "step": 3963 }, { "epoch": 2.14, "learning_rate": 4.684652550323971e-08, "logits/chosen": -2.0876455307006836, "logits/rejected": -2.087301731109619, "logps/chosen": -0.7878615260124207, "logps/rejected": -2.5206828117370605, "loss": 0.602, "rewards/accuracies": 1.0, "rewards/chosen": 1.1245274543762207, "rewards/margins": 0.19147348403930664, "rewards/rejected": 0.9330539703369141, "step": 3964 }, { "epoch": 2.14, "learning_rate": 4.682473143395626e-08, "logits/chosen": -2.0361673831939697, "logits/rejected": -2.288545846939087, "logps/chosen": -0.33165058493614197, "logps/rejected": -0.37992244958877563, "loss": 0.6654, "rewards/accuracies": 1.0, "rewards/chosen": 0.9696137309074402, "rewards/margins": 0.056287169456481934, "rewards/rejected": 0.9133265614509583, "step": 3965 }, { "epoch": 2.14, "learning_rate": 4.680293797037629e-08, "logits/chosen": -1.9784713983535767, "logits/rejected": -1.976287841796875, "logps/chosen": -2.008451223373413, "logps/rejected": -5.354708671569824, "loss": 0.4753, "rewards/accuracies": 1.0, "rewards/chosen": 1.2472933530807495, "rewards/margins": 0.4966391921043396, "rewards/rejected": 0.7506541609764099, "step": 3966 }, { "epoch": 2.14, "learning_rate": 4.678114511665709e-08, "logits/chosen": -2.1256332397460938, "logits/rejected": -2.296630859375, "logps/chosen": -1.7275428771972656, "logps/rejected": -1.71112060546875, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.7032126784324646, "rewards/margins": -0.004291415214538574, "rewards/rejected": 0.7075040936470032, "step": 3967 }, { "epoch": 2.14, "learning_rate": 4.6759352876955764e-08, "logits/chosen": -2.1906306743621826, "logits/rejected": -2.378187656402588, "logps/chosen": -0.41147667169570923, "logps/rejected": -0.4056655466556549, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 1.0457786321640015, "rewards/margins": 0.009474635124206543, "rewards/rejected": 1.036303997039795, "step": 3968 }, { "epoch": 2.14, "learning_rate": 4.673756125542933e-08, "logits/chosen": -2.0720574855804443, "logits/rejected": -2.0639572143554688, "logps/chosen": -3.6423208713531494, "logps/rejected": -3.6738171577453613, "loss": 0.4922, "rewards/accuracies": 1.0, "rewards/chosen": 0.9553434252738953, "rewards/margins": 0.4526292085647583, "rewards/rejected": 0.502714216709137, "step": 3969 }, { "epoch": 2.14, "learning_rate": 4.671577025623469e-08, "logits/chosen": -2.0617165565490723, "logits/rejected": -2.3056554794311523, "logps/chosen": -1.0612573623657227, "logps/rejected": -1.3193702697753906, "loss": 0.7257, "rewards/accuracies": 0.0, "rewards/chosen": 0.8787417411804199, "rewards/margins": -0.06410998106002808, "rewards/rejected": 0.942851722240448, "step": 3970 }, { "epoch": 2.14, "learning_rate": 4.669397988352861e-08, "logits/chosen": -2.0884125232696533, "logits/rejected": -2.101742744445801, "logps/chosen": -5.086781024932861, "logps/rejected": -5.3966569900512695, "loss": 0.3099, "rewards/accuracies": 1.0, "rewards/chosen": 1.8798980712890625, "rewards/margins": 1.012491226196289, "rewards/rejected": 0.8674067854881287, "step": 3971 }, { "epoch": 2.14, "learning_rate": 4.667219014146775e-08, "logits/chosen": -2.0830469131469727, "logits/rejected": -2.099663496017456, "logps/chosen": -1.1830708980560303, "logps/rejected": -3.174736499786377, "loss": 0.3909, "rewards/accuracies": 1.0, "rewards/chosen": 1.5679203271865845, "rewards/margins": 0.7373485565185547, "rewards/rejected": 0.8305717706680298, "step": 3972 }, { "epoch": 2.14, "learning_rate": 4.6650401034208646e-08, "logits/chosen": -2.046203851699829, "logits/rejected": -2.0921783447265625, "logps/chosen": -6.532109260559082, "logps/rejected": -17.430246353149414, "loss": 0.5625, "rewards/accuracies": 1.0, "rewards/chosen": 1.0435044765472412, "rewards/margins": 0.28086334466934204, "rewards/rejected": 0.7626411318778992, "step": 3973 }, { "epoch": 2.14, "learning_rate": 4.6628612565907724e-08, "logits/chosen": -1.9872355461120605, "logits/rejected": -1.9841903448104858, "logps/chosen": -2.7866482734680176, "logps/rejected": -4.144791603088379, "loss": 0.3614, "rewards/accuracies": 1.0, "rewards/chosen": 1.38225519657135, "rewards/margins": 0.831773579120636, "rewards/rejected": 0.5504816174507141, "step": 3974 }, { "epoch": 2.14, "learning_rate": 4.6606824740721265e-08, "logits/chosen": -2.118732452392578, "logits/rejected": -2.368868589401245, "logps/chosen": -0.9189754724502563, "logps/rejected": -1.0609179735183716, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.7978718280792236, "rewards/margins": 0.016626417636871338, "rewards/rejected": 0.7812454104423523, "step": 3975 }, { "epoch": 2.14, "learning_rate": 4.658503756280545e-08, "logits/chosen": -2.1168463230133057, "logits/rejected": -2.113675117492676, "logps/chosen": -8.05228042602539, "logps/rejected": -3.731912851333618, "loss": 0.4135, "rewards/accuracies": 1.0, "rewards/chosen": 1.2200568914413452, "rewards/margins": 0.6691148281097412, "rewards/rejected": 0.550942063331604, "step": 3976 }, { "epoch": 2.15, "learning_rate": 4.656325103631632e-08, "logits/chosen": -2.052703619003296, "logits/rejected": -2.273977041244507, "logps/chosen": -0.3405710458755493, "logps/rejected": -0.3380952775478363, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.8778629302978516, "rewards/margins": 0.00838249921798706, "rewards/rejected": 0.8694804310798645, "step": 3977 }, { "epoch": 2.15, "learning_rate": 4.654146516540982e-08, "logits/chosen": -2.0835533142089844, "logits/rejected": -2.0890073776245117, "logps/chosen": -1.3292124271392822, "logps/rejected": -2.9915359020233154, "loss": 0.4272, "rewards/accuracies": 1.0, "rewards/chosen": 1.1568915843963623, "rewards/margins": 0.629367470741272, "rewards/rejected": 0.5275241136550903, "step": 3978 }, { "epoch": 2.15, "learning_rate": 4.651967995424172e-08, "logits/chosen": -2.1624350547790527, "logits/rejected": -2.1913540363311768, "logps/chosen": -4.165838718414307, "logps/rejected": -20.68358039855957, "loss": 0.1951, "rewards/accuracies": 1.0, "rewards/chosen": 1.7250648736953735, "rewards/margins": 1.5352407693862915, "rewards/rejected": 0.18982410430908203, "step": 3979 }, { "epoch": 2.15, "learning_rate": 4.649789540696772e-08, "logits/chosen": -2.0956716537475586, "logits/rejected": -2.2721729278564453, "logps/chosen": -0.7540950775146484, "logps/rejected": -0.8833391070365906, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.6693792939186096, "rewards/margins": 0.01975405216217041, "rewards/rejected": 0.6496252417564392, "step": 3980 }, { "epoch": 2.15, "learning_rate": 4.6476111527743346e-08, "logits/chosen": -2.042694330215454, "logits/rejected": -2.040627956390381, "logps/chosen": -6.327687740325928, "logps/rejected": -5.586906433105469, "loss": 0.3306, "rewards/accuracies": 1.0, "rewards/chosen": 1.2718786001205444, "rewards/margins": 0.9370772838592529, "rewards/rejected": 0.3348012864589691, "step": 3981 }, { "epoch": 2.15, "learning_rate": 4.645432832072403e-08, "logits/chosen": -1.9661755561828613, "logits/rejected": -1.9565678834915161, "logps/chosen": -0.6081395149230957, "logps/rejected": -4.444098472595215, "loss": 0.4791, "rewards/accuracies": 1.0, "rewards/chosen": 1.1614010334014893, "rewards/margins": 0.48682504892349243, "rewards/rejected": 0.6745759844779968, "step": 3982 }, { "epoch": 2.15, "learning_rate": 4.643254579006505e-08, "logits/chosen": -2.006446361541748, "logits/rejected": -2.0100789070129395, "logps/chosen": -3.4436731338500977, "logps/rejected": -1.483047366142273, "loss": 0.634, "rewards/accuracies": 1.0, "rewards/chosen": 1.3246079683303833, "rewards/margins": 0.12205874919891357, "rewards/rejected": 1.2025492191314697, "step": 3983 }, { "epoch": 2.15, "learning_rate": 4.641076393992158e-08, "logits/chosen": -2.1325693130493164, "logits/rejected": -2.28407883644104, "logps/chosen": -0.5263062715530396, "logps/rejected": -0.49915000796318054, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.97077876329422, "rewards/margins": 0.008227050304412842, "rewards/rejected": 0.9625517129898071, "step": 3984 }, { "epoch": 2.15, "learning_rate": 4.6388982774448656e-08, "logits/chosen": -2.0681354999542236, "logits/rejected": -2.062347173690796, "logps/chosen": -2.099155902862549, "logps/rejected": -3.92612886428833, "loss": 0.5241, "rewards/accuracies": 1.0, "rewards/chosen": 0.9594446420669556, "rewards/margins": 0.3726964592933655, "rewards/rejected": 0.5867481827735901, "step": 3985 }, { "epoch": 2.15, "learning_rate": 4.6367202297801166e-08, "logits/chosen": -2.0107295513153076, "logits/rejected": -2.277390480041504, "logps/chosen": -4.3987016677856445, "logps/rejected": -4.337441444396973, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": 0.7337754368782043, "rewards/margins": -0.008890271186828613, "rewards/rejected": 0.742665708065033, "step": 3986 }, { "epoch": 2.15, "learning_rate": 4.634542251413389e-08, "logits/chosen": -2.1481893062591553, "logits/rejected": -2.0247347354888916, "logps/chosen": -21.724933624267578, "logps/rejected": -2.560448408126831, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 1.7747833728790283, "rewards/margins": 1.1342642307281494, "rewards/rejected": 0.6405190825462341, "step": 3987 }, { "epoch": 2.15, "learning_rate": 4.6323643427601436e-08, "logits/chosen": -2.146674394607544, "logits/rejected": -2.2088537216186523, "logps/chosen": -6.75526237487793, "logps/rejected": -16.668563842773438, "loss": 0.5496, "rewards/accuracies": 1.0, "rewards/chosen": 1.527992844581604, "rewards/margins": 0.31121039390563965, "rewards/rejected": 1.2167824506759644, "step": 3988 }, { "epoch": 2.15, "learning_rate": 4.630186504235836e-08, "logits/chosen": -2.017338752746582, "logits/rejected": -2.028435230255127, "logps/chosen": -1.5028111934661865, "logps/rejected": -2.0620932579040527, "loss": 0.4636, "rewards/accuracies": 1.0, "rewards/chosen": 1.1882673501968384, "rewards/margins": 0.5280611515045166, "rewards/rejected": 0.6602061986923218, "step": 3989 }, { "epoch": 2.15, "learning_rate": 4.6280087362559e-08, "logits/chosen": -2.196194648742676, "logits/rejected": -2.2046632766723633, "logps/chosen": -1.3891491889953613, "logps/rejected": -2.037924289703369, "loss": 0.4695, "rewards/accuracies": 1.0, "rewards/chosen": 1.0677403211593628, "rewards/margins": 0.5120749473571777, "rewards/rejected": 0.5556653738021851, "step": 3990 }, { "epoch": 2.15, "learning_rate": 4.62583103923576e-08, "logits/chosen": -2.192314386367798, "logits/rejected": -2.345365524291992, "logps/chosen": -0.29838496446609497, "logps/rejected": -0.3074236810207367, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.8350514769554138, "rewards/margins": 0.012988865375518799, "rewards/rejected": 0.822062611579895, "step": 3991 }, { "epoch": 2.15, "learning_rate": 4.6236534135908266e-08, "logits/chosen": -2.06321382522583, "logits/rejected": -2.321108818054199, "logps/chosen": -0.3525138795375824, "logps/rejected": -0.44913774728775024, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.7869328856468201, "rewards/margins": -0.01402902603149414, "rewards/rejected": 0.8009619116783142, "step": 3992 }, { "epoch": 2.15, "learning_rate": 4.621475859736497e-08, "logits/chosen": -2.0743138790130615, "logits/rejected": -2.077932119369507, "logps/chosen": -0.4759165346622467, "logps/rejected": -13.542622566223145, "loss": 0.4544, "rewards/accuracies": 1.0, "rewards/chosen": 1.0435525178909302, "rewards/margins": 0.5530855655670166, "rewards/rejected": 0.49046698212623596, "step": 3993 }, { "epoch": 2.15, "learning_rate": 4.619298378088152e-08, "logits/chosen": -2.046719789505005, "logits/rejected": -2.046419858932495, "logps/chosen": -0.29954755306243896, "logps/rejected": -4.3845906257629395, "loss": 0.4727, "rewards/accuracies": 1.0, "rewards/chosen": 1.0722254514694214, "rewards/margins": 0.5036517381668091, "rewards/rejected": 0.5685737133026123, "step": 3994 }, { "epoch": 2.15, "learning_rate": 4.6171209690611605e-08, "logits/chosen": -2.0442843437194824, "logits/rejected": -2.240933895111084, "logps/chosen": -4.443089962005615, "logps/rejected": -0.8160876035690308, "loss": 0.7547, "rewards/accuracies": 0.0, "rewards/chosen": 0.7692543864250183, "rewards/margins": -0.11955302953720093, "rewards/rejected": 0.8888074159622192, "step": 3995 }, { "epoch": 2.16, "learning_rate": 4.61494363307088e-08, "logits/chosen": -2.171025514602661, "logits/rejected": -2.32974910736084, "logps/chosen": -0.40987610816955566, "logps/rejected": -0.3984870910644531, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.9771490097045898, "rewards/margins": 0.010820448398590088, "rewards/rejected": 0.9663285613059998, "step": 3996 }, { "epoch": 2.16, "learning_rate": 4.612766370532649e-08, "logits/chosen": -2.0734596252441406, "logits/rejected": -2.2638421058654785, "logps/chosen": -0.3865582346916199, "logps/rejected": -0.38190457224845886, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8702759146690369, "rewards/margins": 0.02252197265625, "rewards/rejected": 0.8477539420127869, "step": 3997 }, { "epoch": 2.16, "learning_rate": 4.610589181861797e-08, "logits/chosen": -2.0632917881011963, "logits/rejected": -2.225870370864868, "logps/chosen": -1.7367767095565796, "logps/rejected": -1.864225149154663, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 1.0643290281295776, "rewards/margins": 0.0004309415817260742, "rewards/rejected": 1.0638980865478516, "step": 3998 }, { "epoch": 2.16, "learning_rate": 4.608412067473636e-08, "logits/chosen": -1.9743900299072266, "logits/rejected": -2.2136049270629883, "logps/chosen": -0.7598093748092651, "logps/rejected": -0.7722735404968262, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.7691705822944641, "rewards/margins": 0.008113980293273926, "rewards/rejected": 0.7610566020011902, "step": 3999 }, { "epoch": 2.16, "learning_rate": 4.6062350277834647e-08, "logits/chosen": -2.0888912677764893, "logits/rejected": -2.2391932010650635, "logps/chosen": -3.233201265335083, "logps/rejected": -7.139151096343994, "loss": 0.7483, "rewards/accuracies": 0.0, "rewards/chosen": 1.0399225950241089, "rewards/margins": -0.10742950439453125, "rewards/rejected": 1.1473520994186401, "step": 4000 }, { "epoch": 2.16, "learning_rate": 4.604058063206569e-08, "logits/chosen": -2.0839390754699707, "logits/rejected": -2.272674083709717, "logps/chosen": -1.2327122688293457, "logps/rejected": -1.299729585647583, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.7787237763404846, "rewards/margins": 0.00955212116241455, "rewards/rejected": 0.7691716551780701, "step": 4001 }, { "epoch": 2.16, "learning_rate": 4.6018811741582195e-08, "logits/chosen": -2.0515060424804688, "logits/rejected": -2.297854423522949, "logps/chosen": -0.5916714072227478, "logps/rejected": -0.6573439836502075, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.832366943359375, "rewards/margins": 0.021230101585388184, "rewards/rejected": 0.8111368417739868, "step": 4002 }, { "epoch": 2.16, "learning_rate": 4.5997043610536705e-08, "logits/chosen": -2.1345646381378174, "logits/rejected": -2.138417959213257, "logps/chosen": -0.5765081644058228, "logps/rejected": -4.7092108726501465, "loss": 0.4265, "rewards/accuracies": 1.0, "rewards/chosen": 1.0935900211334229, "rewards/margins": 0.6314237117767334, "rewards/rejected": 0.46216627955436707, "step": 4003 }, { "epoch": 2.16, "learning_rate": 4.597527624308165e-08, "logits/chosen": -2.1427578926086426, "logits/rejected": -2.1396055221557617, "logps/chosen": -7.883965969085693, "logps/rejected": -3.5412638187408447, "loss": 0.34, "rewards/accuracies": 1.0, "rewards/chosen": 1.4888025522232056, "rewards/margins": 0.9039312601089478, "rewards/rejected": 0.5848712921142578, "step": 4004 }, { "epoch": 2.16, "learning_rate": 4.5953509643369296e-08, "logits/chosen": -2.0503907203674316, "logits/rejected": -2.043971538543701, "logps/chosen": -4.548341751098633, "logps/rejected": -4.316828727722168, "loss": 0.2698, "rewards/accuracies": 1.0, "rewards/chosen": 1.610698938369751, "rewards/margins": 1.17234468460083, "rewards/rejected": 0.4383542239665985, "step": 4005 }, { "epoch": 2.16, "learning_rate": 4.593174381555176e-08, "logits/chosen": -2.0649216175079346, "logits/rejected": -2.06298828125, "logps/chosen": -0.40476375818252563, "logps/rejected": -2.256948471069336, "loss": 0.5383, "rewards/accuracies": 1.0, "rewards/chosen": 1.0082625150680542, "rewards/margins": 0.3380717635154724, "rewards/rejected": 0.6701907515525818, "step": 4006 }, { "epoch": 2.16, "learning_rate": 4.590997876378103e-08, "logits/chosen": -1.9534335136413574, "logits/rejected": -1.9482954740524292, "logps/chosen": -2.8670504093170166, "logps/rejected": -6.541758060455322, "loss": 0.4767, "rewards/accuracies": 1.0, "rewards/chosen": 0.8430117964744568, "rewards/margins": 0.4929506182670593, "rewards/rejected": 0.35006117820739746, "step": 4007 }, { "epoch": 2.16, "learning_rate": 4.5888214492208916e-08, "logits/chosen": -2.2088141441345215, "logits/rejected": -2.237316846847534, "logps/chosen": -0.6385582089424133, "logps/rejected": -9.597115516662598, "loss": 0.5146, "rewards/accuracies": 1.0, "rewards/chosen": 0.941336452960968, "rewards/margins": 0.39615124464035034, "rewards/rejected": 0.5451852083206177, "step": 4008 }, { "epoch": 2.16, "learning_rate": 4.586645100498711e-08, "logits/chosen": -2.083815574645996, "logits/rejected": -2.07401967048645, "logps/chosen": -4.969385623931885, "logps/rejected": -0.8806419372558594, "loss": 0.4661, "rewards/accuracies": 1.0, "rewards/chosen": 1.526348352432251, "rewards/margins": 0.5211994647979736, "rewards/rejected": 1.0051488876342773, "step": 4009 }, { "epoch": 2.16, "learning_rate": 4.584468830626714e-08, "logits/chosen": -2.2071101665496826, "logits/rejected": -2.2077994346618652, "logps/chosen": -3.413804292678833, "logps/rejected": -4.448538303375244, "loss": 0.4605, "rewards/accuracies": 1.0, "rewards/chosen": 1.5232352018356323, "rewards/margins": 0.536297082901001, "rewards/rejected": 0.9869381189346313, "step": 4010 }, { "epoch": 2.16, "learning_rate": 4.5822926400200365e-08, "logits/chosen": -2.1156554222106934, "logits/rejected": -2.2691526412963867, "logps/chosen": -0.21479327976703644, "logps/rejected": -0.2012266367673874, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.9068301320075989, "rewards/margins": 0.003775477409362793, "rewards/rejected": 0.9030546545982361, "step": 4011 }, { "epoch": 2.16, "learning_rate": 4.580116529093804e-08, "logits/chosen": -2.0840821266174316, "logits/rejected": -2.0746009349823, "logps/chosen": -1.390367031097412, "logps/rejected": -12.332466125488281, "loss": 0.2705, "rewards/accuracies": 1.0, "rewards/chosen": 1.1233131885528564, "rewards/margins": 1.1691652536392212, "rewards/rejected": -0.045852091163396835, "step": 4012 }, { "epoch": 2.16, "learning_rate": 4.5779404982631235e-08, "logits/chosen": -2.0101966857910156, "logits/rejected": -2.2772600650787354, "logps/chosen": -1.8356151580810547, "logps/rejected": -1.879327416419983, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.6273291707038879, "rewards/margins": 0.03142780065536499, "rewards/rejected": 0.595901370048523, "step": 4013 }, { "epoch": 2.17, "learning_rate": 4.5757645479430867e-08, "logits/chosen": -2.0234289169311523, "logits/rejected": -2.1083431243896484, "logps/chosen": -1.9329090118408203, "logps/rejected": -19.23066520690918, "loss": 0.7063, "rewards/accuracies": 0.0, "rewards/chosen": 0.8810957074165344, "rewards/margins": -0.026057422161102295, "rewards/rejected": 0.9071531295776367, "step": 4014 }, { "epoch": 2.17, "learning_rate": 4.573588678548771e-08, "logits/chosen": -2.261579751968384, "logits/rejected": -2.415731906890869, "logps/chosen": -0.5345863103866577, "logps/rejected": -0.6195414066314697, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.8626450896263123, "rewards/margins": 0.005350053310394287, "rewards/rejected": 0.857295036315918, "step": 4015 }, { "epoch": 2.17, "learning_rate": 4.571412890495239e-08, "logits/chosen": -2.0694737434387207, "logits/rejected": -2.0699973106384277, "logps/chosen": -1.3160836696624756, "logps/rejected": -2.6898932456970215, "loss": 0.5326, "rewards/accuracies": 1.0, "rewards/chosen": 1.2192705869674683, "rewards/margins": 0.3518335819244385, "rewards/rejected": 0.8674370050430298, "step": 4016 }, { "epoch": 2.17, "learning_rate": 4.569237184197533e-08, "logits/chosen": -2.065143585205078, "logits/rejected": -2.0667426586151123, "logps/chosen": -0.16921067237854004, "logps/rejected": -6.2831339836120605, "loss": 0.4321, "rewards/accuracies": 1.0, "rewards/chosen": 0.9346342086791992, "rewards/margins": 0.615165650844574, "rewards/rejected": 0.31946855783462524, "step": 4017 }, { "epoch": 2.17, "learning_rate": 4.5670615600706865e-08, "logits/chosen": -2.160181999206543, "logits/rejected": -2.1608798503875732, "logps/chosen": -2.607358455657959, "logps/rejected": -3.446455955505371, "loss": 0.4184, "rewards/accuracies": 1.0, "rewards/chosen": 1.4589956998825073, "rewards/margins": 0.654728353023529, "rewards/rejected": 0.8042673468589783, "step": 4018 }, { "epoch": 2.17, "learning_rate": 4.5648860185297135e-08, "logits/chosen": -2.047452211380005, "logits/rejected": -2.2976231575012207, "logps/chosen": -0.4724937379360199, "logps/rejected": -0.5426318645477295, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.9491764307022095, "rewards/margins": 0.010042309761047363, "rewards/rejected": 0.9391341209411621, "step": 4019 }, { "epoch": 2.17, "learning_rate": 4.562710559989612e-08, "logits/chosen": -2.1360790729522705, "logits/rejected": -2.2573704719543457, "logps/chosen": -0.2556021213531494, "logps/rejected": -0.29642942547798157, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": 0.8117000460624695, "rewards/margins": -0.006654858589172363, "rewards/rejected": 0.8183549046516418, "step": 4020 }, { "epoch": 2.17, "learning_rate": 4.560535184865366e-08, "logits/chosen": -2.0678277015686035, "logits/rejected": -2.0684561729431152, "logps/chosen": -0.5853987336158752, "logps/rejected": -5.434847354888916, "loss": 0.4567, "rewards/accuracies": 1.0, "rewards/chosen": 0.9582602381706238, "rewards/margins": 0.5466703772544861, "rewards/rejected": 0.4115898609161377, "step": 4021 }, { "epoch": 2.17, "learning_rate": 4.5583598935719405e-08, "logits/chosen": -2.0943005084991455, "logits/rejected": -2.254936456680298, "logps/chosen": -1.7488551139831543, "logps/rejected": -1.5707520246505737, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 1.084337830543518, "rewards/margins": 0.0016455650329589844, "rewards/rejected": 1.082692265510559, "step": 4022 }, { "epoch": 2.17, "learning_rate": 4.556184686524293e-08, "logits/chosen": -2.1461431980133057, "logits/rejected": -2.151930809020996, "logps/chosen": -2.021042585372925, "logps/rejected": -3.6291377544403076, "loss": 0.4218, "rewards/accuracies": 1.0, "rewards/chosen": 1.3132017850875854, "rewards/margins": 0.6449622511863708, "rewards/rejected": 0.6682395339012146, "step": 4023 }, { "epoch": 2.17, "learning_rate": 4.554009564137353e-08, "logits/chosen": -1.992504596710205, "logits/rejected": -2.2975914478302, "logps/chosen": -0.4681200385093689, "logps/rejected": -0.5829506516456604, "loss": 0.6645, "rewards/accuracies": 1.0, "rewards/chosen": 0.829793393611908, "rewards/margins": 0.05817580223083496, "rewards/rejected": 0.771617591381073, "step": 4024 }, { "epoch": 2.17, "learning_rate": 4.55183452682604e-08, "logits/chosen": -2.086782217025757, "logits/rejected": -2.2998578548431396, "logps/chosen": -0.467302143573761, "logps/rejected": -0.4150645136833191, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 0.8841850161552429, "rewards/margins": -0.021784424781799316, "rewards/rejected": 0.9059694409370422, "step": 4025 }, { "epoch": 2.17, "learning_rate": 4.5496595750052585e-08, "logits/chosen": -2.14874005317688, "logits/rejected": -2.1484925746917725, "logps/chosen": -4.287031650543213, "logps/rejected": -2.531667470932007, "loss": 0.2762, "rewards/accuracies": 1.0, "rewards/chosen": 1.6761223077774048, "rewards/margins": 1.1455345153808594, "rewards/rejected": 0.5305878520011902, "step": 4026 }, { "epoch": 2.17, "learning_rate": 4.5474847090898946e-08, "logits/chosen": -1.9963164329528809, "logits/rejected": -1.9942156076431274, "logps/chosen": -0.5734690427780151, "logps/rejected": -7.912101745605469, "loss": 0.5885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9234141707420349, "rewards/margins": 0.221532940864563, "rewards/rejected": 0.7018812298774719, "step": 4027 }, { "epoch": 2.17, "learning_rate": 4.5453099294948173e-08, "logits/chosen": -2.085299015045166, "logits/rejected": -2.084653854370117, "logps/chosen": -4.790380477905273, "logps/rejected": -5.960116863250732, "loss": 0.4851, "rewards/accuracies": 1.0, "rewards/chosen": 1.0525150299072266, "rewards/margins": 0.4710844159126282, "rewards/rejected": 0.5814306139945984, "step": 4028 }, { "epoch": 2.17, "learning_rate": 4.543135236634881e-08, "logits/chosen": -2.132143974304199, "logits/rejected": -2.1172595024108887, "logps/chosen": -31.733230590820312, "logps/rejected": -27.597431182861328, "loss": 0.2208, "rewards/accuracies": 1.0, "rewards/chosen": 2.1007988452911377, "rewards/margins": 1.397910714149475, "rewards/rejected": 0.7028881311416626, "step": 4029 }, { "epoch": 2.17, "learning_rate": 4.540960630924922e-08, "logits/chosen": -2.039597511291504, "logits/rejected": -2.317176103591919, "logps/chosen": -0.6036754250526428, "logps/rejected": -0.6189000010490417, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.8950307965278625, "rewards/margins": 0.031622231006622314, "rewards/rejected": 0.8634085655212402, "step": 4030 }, { "epoch": 2.17, "learning_rate": 4.538786112779762e-08, "logits/chosen": -2.014408826828003, "logits/rejected": -2.0516843795776367, "logps/chosen": -8.40783977508545, "logps/rejected": -15.973458290100098, "loss": 0.4595, "rewards/accuracies": 1.0, "rewards/chosen": 1.1859978437423706, "rewards/margins": 0.5389959216117859, "rewards/rejected": 0.6470019221305847, "step": 4031 }, { "epoch": 2.17, "learning_rate": 4.5366116826142046e-08, "logits/chosen": -1.9435863494873047, "logits/rejected": -2.2735135555267334, "logps/chosen": -0.23573480546474457, "logps/rejected": -0.2609435021877289, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.98150634765625, "rewards/margins": -0.0005657076835632324, "rewards/rejected": 0.9820720553398132, "step": 4032 }, { "epoch": 2.18, "learning_rate": 4.5344373408430336e-08, "logits/chosen": -2.0397865772247314, "logits/rejected": -2.0317604541778564, "logps/chosen": -10.693236351013184, "logps/rejected": -5.344633102416992, "loss": 0.1929, "rewards/accuracies": 1.0, "rewards/chosen": 2.2532525062561035, "rewards/margins": 1.5474863052368164, "rewards/rejected": 0.7057661414146423, "step": 4033 }, { "epoch": 2.18, "learning_rate": 4.5322630878810245e-08, "logits/chosen": -2.1037487983703613, "logits/rejected": -2.1006784439086914, "logps/chosen": -0.9588636755943298, "logps/rejected": -4.090758323669434, "loss": 0.4358, "rewards/accuracies": 1.0, "rewards/chosen": 1.1397725343704224, "rewards/margins": 0.6047685742378235, "rewards/rejected": 0.5350039601325989, "step": 4034 }, { "epoch": 2.18, "learning_rate": 4.530088924142926e-08, "logits/chosen": -2.0809950828552246, "logits/rejected": -2.3202080726623535, "logps/chosen": -5.604776382446289, "logps/rejected": -1.2193251848220825, "loss": 0.707, "rewards/accuracies": 0.0, "rewards/chosen": 0.6888914108276367, "rewards/margins": -0.027500629425048828, "rewards/rejected": 0.7163920402526855, "step": 4035 }, { "epoch": 2.18, "learning_rate": 4.5279148500434773e-08, "logits/chosen": -2.043635845184326, "logits/rejected": -2.304255723953247, "logps/chosen": -1.1583905220031738, "logps/rejected": -1.2642713785171509, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 1.239538550376892, "rewards/margins": 0.024754047393798828, "rewards/rejected": 1.2147845029830933, "step": 4036 }, { "epoch": 2.18, "learning_rate": 4.525740865997395e-08, "logits/chosen": -2.075608730316162, "logits/rejected": -2.2954416275024414, "logps/chosen": -0.19848810136318207, "logps/rejected": -0.23627305030822754, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.8058273196220398, "rewards/margins": 0.006984233856201172, "rewards/rejected": 0.7988430857658386, "step": 4037 }, { "epoch": 2.18, "learning_rate": 4.5235669724193826e-08, "logits/chosen": -2.1067495346069336, "logits/rejected": -2.116669178009033, "logps/chosen": -5.171528339385986, "logps/rejected": -3.860034704208374, "loss": 0.2799, "rewards/accuracies": 1.0, "rewards/chosen": 1.6887943744659424, "rewards/margins": 1.130136251449585, "rewards/rejected": 0.5586581230163574, "step": 4038 }, { "epoch": 2.18, "learning_rate": 4.521393169724124e-08, "logits/chosen": -2.1122756004333496, "logits/rejected": -2.3104541301727295, "logps/chosen": -1.326998233795166, "logps/rejected": -1.4062365293502808, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 1.0786532163619995, "rewards/margins": 0.02194523811340332, "rewards/rejected": 1.0567079782485962, "step": 4039 }, { "epoch": 2.18, "learning_rate": 4.519219458326288e-08, "logits/chosen": -2.3085310459136963, "logits/rejected": -2.0199732780456543, "logps/chosen": -63.954307556152344, "logps/rejected": -13.126901626586914, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": 3.579639434814453, "rewards/margins": 2.8781702518463135, "rewards/rejected": 0.7014692425727844, "step": 4040 }, { "epoch": 2.18, "learning_rate": 4.517045838640521e-08, "logits/chosen": -2.0602126121520996, "logits/rejected": -2.321110486984253, "logps/chosen": -0.22373974323272705, "logps/rejected": -0.22236420214176178, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.9828705787658691, "rewards/margins": 0.039132773876190186, "rewards/rejected": 0.943737804889679, "step": 4041 }, { "epoch": 2.18, "learning_rate": 4.514872311081457e-08, "logits/chosen": -2.041820764541626, "logits/rejected": -2.311680793762207, "logps/chosen": -0.24613319337368011, "logps/rejected": -0.2338975965976715, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.8846228718757629, "rewards/margins": -0.010680675506591797, "rewards/rejected": 0.8953035473823547, "step": 4042 }, { "epoch": 2.18, "learning_rate": 4.512698876063712e-08, "logits/chosen": -2.0546936988830566, "logits/rejected": -2.063718795776367, "logps/chosen": -3.35323429107666, "logps/rejected": -3.6605358123779297, "loss": 0.4035, "rewards/accuracies": 1.0, "rewards/chosen": 1.2518794536590576, "rewards/margins": 0.6989500522613525, "rewards/rejected": 0.5529294013977051, "step": 4043 }, { "epoch": 2.18, "learning_rate": 4.5105255340018796e-08, "logits/chosen": -2.2553317546844482, "logits/rejected": -2.3416147232055664, "logps/chosen": -2.1378936767578125, "logps/rejected": -2.3064632415771484, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.785295307636261, "rewards/margins": 0.012821555137634277, "rewards/rejected": 0.7724737524986267, "step": 4044 }, { "epoch": 2.18, "learning_rate": 4.508352285310544e-08, "logits/chosen": -2.0476999282836914, "logits/rejected": -2.2611958980560303, "logps/chosen": -0.3069217801094055, "logps/rejected": -5.035447597503662, "loss": 0.5442, "rewards/accuracies": 1.0, "rewards/chosen": 0.9474604725837708, "rewards/margins": 0.32407146692276, "rewards/rejected": 0.6233890056610107, "step": 4045 }, { "epoch": 2.18, "learning_rate": 4.5061791304042655e-08, "logits/chosen": -1.9831573963165283, "logits/rejected": -2.2706809043884277, "logps/chosen": -1.317191243171692, "logps/rejected": -1.1624902486801147, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.7884842157363892, "rewards/margins": 0.01965254545211792, "rewards/rejected": 0.7688316702842712, "step": 4046 }, { "epoch": 2.18, "learning_rate": 4.504006069697586e-08, "logits/chosen": -2.2118213176727295, "logits/rejected": -2.220643997192383, "logps/chosen": -1.1488231420516968, "logps/rejected": -3.0628902912139893, "loss": 0.4307, "rewards/accuracies": 1.0, "rewards/chosen": 1.1641582250595093, "rewards/margins": 0.6194044947624207, "rewards/rejected": 0.5447537302970886, "step": 4047 }, { "epoch": 2.18, "learning_rate": 4.5018331036050314e-08, "logits/chosen": -2.1399893760681152, "logits/rejected": -2.028379440307617, "logps/chosen": -22.630207061767578, "logps/rejected": -3.2059199810028076, "loss": 0.1553, "rewards/accuracies": 1.0, "rewards/chosen": 2.3732669353485107, "rewards/margins": 1.7835242748260498, "rewards/rejected": 0.5897426009178162, "step": 4048 }, { "epoch": 2.18, "learning_rate": 4.499660232541111e-08, "logits/chosen": -2.056687116622925, "logits/rejected": -2.2518439292907715, "logps/chosen": -3.7925312519073486, "logps/rejected": -3.396343946456909, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.43549224734306335, "rewards/margins": -0.005595475435256958, "rewards/rejected": 0.4410877227783203, "step": 4049 }, { "epoch": 2.18, "learning_rate": 4.4974874569203127e-08, "logits/chosen": -2.0608952045440674, "logits/rejected": -2.2148478031158447, "logps/chosen": -1.4858185052871704, "logps/rejected": -1.2942715883255005, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.7466697096824646, "rewards/margins": 0.007718086242675781, "rewards/rejected": 0.7389516234397888, "step": 4050 }, { "epoch": 2.19, "learning_rate": 4.495314777157108e-08, "logits/chosen": -2.1082348823547363, "logits/rejected": -2.128248453140259, "logps/chosen": -1.7561852931976318, "logps/rejected": -3.937497615814209, "loss": 0.5233, "rewards/accuracies": 1.0, "rewards/chosen": 1.0842856168746948, "rewards/margins": 0.3746523857116699, "rewards/rejected": 0.7096332311630249, "step": 4051 }, { "epoch": 2.19, "learning_rate": 4.4931421936659514e-08, "logits/chosen": -2.061293601989746, "logits/rejected": -2.302119731903076, "logps/chosen": -1.2598161697387695, "logps/rejected": -1.1130069494247437, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.8704989552497864, "rewards/margins": 0.01752769947052002, "rewards/rejected": 0.8529712557792664, "step": 4052 }, { "epoch": 2.19, "learning_rate": 4.490969706861276e-08, "logits/chosen": -2.1857876777648926, "logits/rejected": -2.3216681480407715, "logps/chosen": -3.621251106262207, "logps/rejected": -10.713069915771484, "loss": 0.5012, "rewards/accuracies": 1.0, "rewards/chosen": 1.0183182954788208, "rewards/margins": 0.42962390184402466, "rewards/rejected": 0.5886943936347961, "step": 4053 }, { "epoch": 2.19, "learning_rate": 4.4887973171574987e-08, "logits/chosen": -2.0691914558410645, "logits/rejected": -2.118598461151123, "logps/chosen": -2.6504344940185547, "logps/rejected": -18.659866333007812, "loss": 0.1905, "rewards/accuracies": 1.0, "rewards/chosen": 1.5416654348373413, "rewards/margins": 1.5613489151000977, "rewards/rejected": -0.019683456048369408, "step": 4054 }, { "epoch": 2.19, "learning_rate": 4.486625024969017e-08, "logits/chosen": -2.255241632461548, "logits/rejected": -2.1403939723968506, "logps/chosen": -26.598207473754883, "logps/rejected": -7.0739030838012695, "loss": 0.2441, "rewards/accuracies": 1.0, "rewards/chosen": 1.558182716369629, "rewards/margins": 1.2858333587646484, "rewards/rejected": 0.27234935760498047, "step": 4055 }, { "epoch": 2.19, "learning_rate": 4.4844528307102096e-08, "logits/chosen": -2.1233580112457275, "logits/rejected": -2.3856492042541504, "logps/chosen": -0.10754562169313431, "logps/rejected": -0.10789494961500168, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.7794767618179321, "rewards/margins": 0.013087093830108643, "rewards/rejected": 0.7663896679878235, "step": 4056 }, { "epoch": 2.19, "learning_rate": 4.4822807347954383e-08, "logits/chosen": -2.245136260986328, "logits/rejected": -2.16085147857666, "logps/chosen": -19.929607391357422, "logps/rejected": -6.25960636138916, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 1.6388866901397705, "rewards/margins": 0.8446176052093506, "rewards/rejected": 0.7942690849304199, "step": 4057 }, { "epoch": 2.19, "learning_rate": 4.480108737639044e-08, "logits/chosen": -2.0670549869537354, "logits/rejected": -2.066427230834961, "logps/chosen": -2.778256416320801, "logps/rejected": -4.8406596183776855, "loss": 0.3024, "rewards/accuracies": 1.0, "rewards/chosen": 1.5529674291610718, "rewards/margins": 1.040823221206665, "rewards/rejected": 0.5121442675590515, "step": 4058 }, { "epoch": 2.19, "learning_rate": 4.4779368396553494e-08, "logits/chosen": -2.112818479537964, "logits/rejected": -2.057732343673706, "logps/chosen": -26.657140731811523, "logps/rejected": -4.412480354309082, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 1.909021258354187, "rewards/margins": 1.3627833127975464, "rewards/rejected": 0.5462379455566406, "step": 4059 }, { "epoch": 2.19, "learning_rate": 4.475765041258659e-08, "logits/chosen": -2.296372652053833, "logits/rejected": -2.3049051761627197, "logps/chosen": -2.8137660026550293, "logps/rejected": -5.087944507598877, "loss": 0.3294, "rewards/accuracies": 1.0, "rewards/chosen": 1.5141277313232422, "rewards/margins": 0.9414077997207642, "rewards/rejected": 0.572719931602478, "step": 4060 }, { "epoch": 2.19, "learning_rate": 4.4735933428632565e-08, "logits/chosen": -2.1045992374420166, "logits/rejected": -2.112032413482666, "logps/chosen": -3.47648286819458, "logps/rejected": -4.765926361083984, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 1.3299614191055298, "rewards/margins": 0.6485462188720703, "rewards/rejected": 0.6814152002334595, "step": 4061 }, { "epoch": 2.19, "learning_rate": 4.471421744883409e-08, "logits/chosen": -2.032482862472534, "logits/rejected": -2.258446216583252, "logps/chosen": -1.7086961269378662, "logps/rejected": -1.5447388887405396, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9672317504882812, "rewards/margins": 0.023305952548980713, "rewards/rejected": 0.9439257979393005, "step": 4062 }, { "epoch": 2.19, "learning_rate": 4.469250247733364e-08, "logits/chosen": -2.1195449829101562, "logits/rejected": -2.313591241836548, "logps/chosen": -0.8519384860992432, "logps/rejected": -0.7952330708503723, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.8117504119873047, "rewards/margins": 0.0036326050758361816, "rewards/rejected": 0.8081178069114685, "step": 4063 }, { "epoch": 2.19, "learning_rate": 4.4670788518273456e-08, "logits/chosen": -2.064992904663086, "logits/rejected": -2.2837369441986084, "logps/chosen": -5.7606611251831055, "logps/rejected": -0.6570421457290649, "loss": 0.7389, "rewards/accuracies": 0.0, "rewards/chosen": 0.8675962686538696, "rewards/margins": -0.08952921628952026, "rewards/rejected": 0.9571254849433899, "step": 4064 }, { "epoch": 2.19, "learning_rate": 4.4649075575795646e-08, "logits/chosen": -2.050677537918091, "logits/rejected": -2.0509250164031982, "logps/chosen": -6.20991849899292, "logps/rejected": -2.4216713905334473, "loss": 0.372, "rewards/accuracies": 1.0, "rewards/chosen": 1.5308250188827515, "rewards/margins": 0.7970668077468872, "rewards/rejected": 0.7337582111358643, "step": 4065 }, { "epoch": 2.19, "learning_rate": 4.4627363654042095e-08, "logits/chosen": -2.2019543647766113, "logits/rejected": -2.2091851234436035, "logps/chosen": -0.9823577404022217, "logps/rejected": -2.775362491607666, "loss": 0.484, "rewards/accuracies": 1.0, "rewards/chosen": 0.9668681025505066, "rewards/margins": 0.473924845457077, "rewards/rejected": 0.49294325709342957, "step": 4066 }, { "epoch": 2.19, "learning_rate": 4.460565275715446e-08, "logits/chosen": -2.1367483139038086, "logits/rejected": -2.1369788646698, "logps/chosen": -2.290205478668213, "logps/rejected": -3.249828338623047, "loss": 0.5206, "rewards/accuracies": 1.0, "rewards/chosen": 0.9969789385795593, "rewards/margins": 0.3813035488128662, "rewards/rejected": 0.6156753897666931, "step": 4067 }, { "epoch": 2.19, "learning_rate": 4.45839428892743e-08, "logits/chosen": -2.196479320526123, "logits/rejected": -2.1944758892059326, "logps/chosen": -2.1523914337158203, "logps/rejected": -3.72312593460083, "loss": 0.5059, "rewards/accuracies": 1.0, "rewards/chosen": 1.1147592067718506, "rewards/margins": 0.4179289937019348, "rewards/rejected": 0.6968302130699158, "step": 4068 }, { "epoch": 2.19, "learning_rate": 4.4562234054542894e-08, "logits/chosen": -2.1602141857147217, "logits/rejected": -2.266775369644165, "logps/chosen": -0.5092723965644836, "logps/rejected": -2.5490193367004395, "loss": 0.6548, "rewards/accuracies": 1.0, "rewards/chosen": 1.1382888555526733, "rewards/margins": 0.07827508449554443, "rewards/rejected": 1.060013771057129, "step": 4069 }, { "epoch": 2.2, "learning_rate": 4.454052625710133e-08, "logits/chosen": -2.0845634937286377, "logits/rejected": -2.1090710163116455, "logps/chosen": -22.873384475708008, "logps/rejected": -8.704214096069336, "loss": 0.3064, "rewards/accuracies": 1.0, "rewards/chosen": 1.8727163076400757, "rewards/margins": 1.025838851928711, "rewards/rejected": 0.8468775153160095, "step": 4070 }, { "epoch": 2.2, "learning_rate": 4.4518819501090525e-08, "logits/chosen": -2.115699529647827, "logits/rejected": -2.187647819519043, "logps/chosen": -0.2977822721004486, "logps/rejected": -29.745803833007812, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 0.9704123735427856, "rewards/margins": 1.4243539571762085, "rewards/rejected": -0.45394155383110046, "step": 4071 }, { "epoch": 2.2, "learning_rate": 4.449711379065118e-08, "logits/chosen": -2.0285818576812744, "logits/rejected": -2.0186729431152344, "logps/chosen": -4.172350883483887, "logps/rejected": -5.3518829345703125, "loss": 0.4005, "rewards/accuracies": 1.0, "rewards/chosen": 1.2940785884857178, "rewards/margins": 0.7081111073493958, "rewards/rejected": 0.585967481136322, "step": 4072 }, { "epoch": 2.2, "learning_rate": 4.447540912992381e-08, "logits/chosen": -2.147719621658325, "logits/rejected": -2.1479954719543457, "logps/chosen": -2.08627986907959, "logps/rejected": -4.045481204986572, "loss": 0.2767, "rewards/accuracies": 1.0, "rewards/chosen": 1.5778164863586426, "rewards/margins": 1.1431217193603516, "rewards/rejected": 0.43469473719596863, "step": 4073 }, { "epoch": 2.2, "learning_rate": 4.445370552304871e-08, "logits/chosen": -2.066582441329956, "logits/rejected": -2.2472405433654785, "logps/chosen": -1.7991108894348145, "logps/rejected": -4.196718215942383, "loss": 0.6509, "rewards/accuracies": 1.0, "rewards/chosen": 1.040094017982483, "rewards/margins": 0.0863460898399353, "rewards/rejected": 0.9537479281425476, "step": 4074 }, { "epoch": 2.2, "learning_rate": 4.4432002974166006e-08, "logits/chosen": -1.9510791301727295, "logits/rejected": -1.944673776626587, "logps/chosen": -3.730881690979004, "logps/rejected": -4.538173675537109, "loss": 0.2507, "rewards/accuracies": 1.0, "rewards/chosen": 1.6090961694717407, "rewards/margins": 1.2553688287734985, "rewards/rejected": 0.3537273406982422, "step": 4075 }, { "epoch": 2.2, "learning_rate": 4.441030148741559e-08, "logits/chosen": -2.0393130779266357, "logits/rejected": -2.2726142406463623, "logps/chosen": -0.41923731565475464, "logps/rejected": -0.4460412263870239, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 1.0204936265945435, "rewards/margins": 0.003891468048095703, "rewards/rejected": 1.0166021585464478, "step": 4076 }, { "epoch": 2.2, "learning_rate": 4.438860106693717e-08, "logits/chosen": -2.1121647357940674, "logits/rejected": -2.1213252544403076, "logps/chosen": -2.8637619018554688, "logps/rejected": -6.223055839538574, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 1.0075297355651855, "rewards/margins": 0.025199294090270996, "rewards/rejected": 0.9823304414749146, "step": 4077 }, { "epoch": 2.2, "learning_rate": 4.436690171687023e-08, "logits/chosen": -2.219730854034424, "logits/rejected": -2.087677478790283, "logps/chosen": -31.354185104370117, "logps/rejected": -3.6665701866149902, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 2.1418352127075195, "rewards/margins": 1.4522106647491455, "rewards/rejected": 0.689624547958374, "step": 4078 }, { "epoch": 2.2, "learning_rate": 4.434520344135409e-08, "logits/chosen": -2.0104830265045166, "logits/rejected": -2.0125820636749268, "logps/chosen": -0.7182144522666931, "logps/rejected": -3.8879199028015137, "loss": 0.5156, "rewards/accuracies": 1.0, "rewards/chosen": 0.9847800135612488, "rewards/margins": 0.39365434646606445, "rewards/rejected": 0.5911256670951843, "step": 4079 }, { "epoch": 2.2, "learning_rate": 4.432350624452782e-08, "logits/chosen": -1.9830538034439087, "logits/rejected": -2.312995672225952, "logps/chosen": -0.3379676043987274, "logps/rejected": -0.30660849809646606, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.7832238078117371, "rewards/margins": -0.012173831462860107, "rewards/rejected": 0.7953976392745972, "step": 4080 }, { "epoch": 2.2, "learning_rate": 4.430181013053031e-08, "logits/chosen": -2.125002861022949, "logits/rejected": -2.327991008758545, "logps/chosen": -0.3652251362800598, "logps/rejected": -0.35724011063575745, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9832270741462708, "rewards/margins": 0.012267112731933594, "rewards/rejected": 0.9709599614143372, "step": 4081 }, { "epoch": 2.2, "learning_rate": 4.428011510350024e-08, "logits/chosen": -2.0158512592315674, "logits/rejected": -2.026353359222412, "logps/chosen": -1.1960926055908203, "logps/rejected": -3.083855152130127, "loss": 0.3829, "rewards/accuracies": 1.0, "rewards/chosen": 1.3495051860809326, "rewards/margins": 0.7625038027763367, "rewards/rejected": 0.587001383304596, "step": 4082 }, { "epoch": 2.2, "learning_rate": 4.425842116757608e-08, "logits/chosen": -1.9958934783935547, "logits/rejected": -2.0203232765197754, "logps/chosen": -6.873525619506836, "logps/rejected": -23.60384750366211, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": 1.2302321195602417, "rewards/margins": -0.018469810485839844, "rewards/rejected": 1.2487019300460815, "step": 4083 }, { "epoch": 2.2, "learning_rate": 4.423672832689609e-08, "logits/chosen": -2.105170249938965, "logits/rejected": -2.112267255783081, "logps/chosen": -0.45192644000053406, "logps/rejected": -16.37413215637207, "loss": 0.35, "rewards/accuracies": 1.0, "rewards/chosen": 0.9554433226585388, "rewards/margins": 0.8698256611824036, "rewards/rejected": 0.08561763912439346, "step": 4084 }, { "epoch": 2.2, "learning_rate": 4.421503658559832e-08, "logits/chosen": -2.0673179626464844, "logits/rejected": -2.0661399364471436, "logps/chosen": -4.175352573394775, "logps/rejected": -6.874136924743652, "loss": 0.4766, "rewards/accuracies": 1.0, "rewards/chosen": 1.4010578393936157, "rewards/margins": 0.49334830045700073, "rewards/rejected": 0.907709538936615, "step": 4085 }, { "epoch": 2.2, "learning_rate": 4.419334594782062e-08, "logits/chosen": -2.0264978408813477, "logits/rejected": -2.332569122314453, "logps/chosen": -1.4003901481628418, "logps/rejected": -0.35333988070487976, "loss": 0.7648, "rewards/accuracies": 0.0, "rewards/chosen": 0.9451179504394531, "rewards/margins": -0.13850653171539307, "rewards/rejected": 1.0836244821548462, "step": 4086 }, { "epoch": 2.2, "learning_rate": 4.417165641770062e-08, "logits/chosen": -2.0474634170532227, "logits/rejected": -2.274632453918457, "logps/chosen": -0.5162697434425354, "logps/rejected": -0.5865865349769592, "loss": 0.7048, "rewards/accuracies": 0.0, "rewards/chosen": 0.8180219531059265, "rewards/margins": -0.02315211296081543, "rewards/rejected": 0.8411740660667419, "step": 4087 }, { "epoch": 2.2, "learning_rate": 4.414996799937574e-08, "logits/chosen": -2.247871160507202, "logits/rejected": -2.0439064502716064, "logps/chosen": -36.6531867980957, "logps/rejected": -4.508884429931641, "loss": 0.0885, "rewards/accuracies": 1.0, "rewards/chosen": 2.853307008743286, "rewards/margins": 2.37981915473938, "rewards/rejected": 0.47348785400390625, "step": 4088 }, { "epoch": 2.21, "learning_rate": 4.4128280696983165e-08, "logits/chosen": -2.1133100986480713, "logits/rejected": -2.259852886199951, "logps/chosen": -3.5429298877716064, "logps/rejected": -3.302856922149658, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": 0.99976646900177, "rewards/margins": -0.009725689888000488, "rewards/rejected": 1.0094921588897705, "step": 4089 }, { "epoch": 2.21, "learning_rate": 4.410659451465994e-08, "logits/chosen": -2.014146566390991, "logits/rejected": -2.003359317779541, "logps/chosen": -5.242527484893799, "logps/rejected": -5.280299663543701, "loss": 0.4318, "rewards/accuracies": 1.0, "rewards/chosen": 1.153998613357544, "rewards/margins": 0.6162382960319519, "rewards/rejected": 0.537760317325592, "step": 4090 }, { "epoch": 2.21, "learning_rate": 4.408490945654282e-08, "logits/chosen": -2.186277389526367, "logits/rejected": -2.186692714691162, "logps/chosen": -3.804744243621826, "logps/rejected": -3.593440532684326, "loss": 0.3178, "rewards/accuracies": 1.0, "rewards/chosen": 1.5336856842041016, "rewards/margins": 0.9832525849342346, "rewards/rejected": 0.5504330992698669, "step": 4091 }, { "epoch": 2.21, "learning_rate": 4.406322552676838e-08, "logits/chosen": -2.0168936252593994, "logits/rejected": -2.314936637878418, "logps/chosen": -0.2163812667131424, "logps/rejected": -0.2501641809940338, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9713476300239563, "rewards/margins": 0.005790352821350098, "rewards/rejected": 0.9655572772026062, "step": 4092 }, { "epoch": 2.21, "learning_rate": 4.404154272947298e-08, "logits/chosen": -2.209648847579956, "logits/rejected": -2.1881940364837646, "logps/chosen": -15.3096342086792, "logps/rejected": -1.5361733436584473, "loss": 0.3583, "rewards/accuracies": 1.0, "rewards/chosen": 1.5289514064788818, "rewards/margins": 0.8417673110961914, "rewards/rejected": 0.6871840953826904, "step": 4093 }, { "epoch": 2.21, "learning_rate": 4.401986106879273e-08, "logits/chosen": -2.0391080379486084, "logits/rejected": -2.0313384532928467, "logps/chosen": -2.85321307182312, "logps/rejected": -6.2753472328186035, "loss": 0.3463, "rewards/accuracies": 1.0, "rewards/chosen": 1.2209540605545044, "rewards/margins": 0.8823705911636353, "rewards/rejected": 0.33858343958854675, "step": 4094 }, { "epoch": 2.21, "learning_rate": 4.399818054886357e-08, "logits/chosen": -2.0750858783721924, "logits/rejected": -2.2049367427825928, "logps/chosen": -3.5368056297302246, "logps/rejected": -2.9434597492218018, "loss": 0.7394, "rewards/accuracies": 0.0, "rewards/chosen": 0.8902786374092102, "rewards/margins": -0.09041911363601685, "rewards/rejected": 0.980697751045227, "step": 4095 }, { "epoch": 2.21, "learning_rate": 4.3976501173821206e-08, "logits/chosen": -2.129673480987549, "logits/rejected": -2.126462936401367, "logps/chosen": -5.662225246429443, "logps/rejected": -4.908923625946045, "loss": 0.4064, "rewards/accuracies": 1.0, "rewards/chosen": 1.6134328842163086, "rewards/margins": 0.6902959942817688, "rewards/rejected": 0.9231368899345398, "step": 4096 }, { "epoch": 2.21, "learning_rate": 4.395482294780112e-08, "logits/chosen": -2.1083712577819824, "logits/rejected": -2.1236608028411865, "logps/chosen": -4.376196384429932, "logps/rejected": -3.544931411743164, "loss": 0.5301, "rewards/accuracies": 1.0, "rewards/chosen": 1.1607630252838135, "rewards/margins": 0.35784780979156494, "rewards/rejected": 0.8029152154922485, "step": 4097 }, { "epoch": 2.21, "learning_rate": 4.3933145874938564e-08, "logits/chosen": -2.012925148010254, "logits/rejected": -2.242295742034912, "logps/chosen": -0.7986778020858765, "logps/rejected": -0.8187809586524963, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9107730984687805, "rewards/margins": 0.01711750030517578, "rewards/rejected": 0.8936555981636047, "step": 4098 }, { "epoch": 2.21, "learning_rate": 4.3911469959368597e-08, "logits/chosen": -2.0598390102386475, "logits/rejected": -2.2605416774749756, "logps/chosen": -0.47639864683151245, "logps/rejected": -1.6677865982055664, "loss": 0.6659, "rewards/accuracies": 1.0, "rewards/chosen": 0.9440985918045044, "rewards/margins": 0.05526697635650635, "rewards/rejected": 0.888831615447998, "step": 4099 }, { "epoch": 2.21, "learning_rate": 4.3889795205226047e-08, "logits/chosen": -2.0113229751586914, "logits/rejected": -2.015521764755249, "logps/chosen": -1.2696970701217651, "logps/rejected": -5.551253795623779, "loss": 0.3841, "rewards/accuracies": 1.0, "rewards/chosen": 1.0854088068008423, "rewards/margins": 0.7587553262710571, "rewards/rejected": 0.32665345072746277, "step": 4100 }, { "epoch": 2.21, "learning_rate": 4.3868121616645486e-08, "logits/chosen": -2.04348087310791, "logits/rejected": -2.0472114086151123, "logps/chosen": -0.2726667523384094, "logps/rejected": -4.0991411209106445, "loss": 0.4604, "rewards/accuracies": 1.0, "rewards/chosen": 0.9084704518318176, "rewards/margins": 0.5366963148117065, "rewards/rejected": 0.3717741072177887, "step": 4101 }, { "epoch": 2.21, "learning_rate": 4.384644919776133e-08, "logits/chosen": -2.1637349128723145, "logits/rejected": -2.2807085514068604, "logps/chosen": -0.664665699005127, "logps/rejected": -0.5621119737625122, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.896802544593811, "rewards/margins": -0.006532013416290283, "rewards/rejected": 0.9033345580101013, "step": 4102 }, { "epoch": 2.21, "learning_rate": 4.3824777952707734e-08, "logits/chosen": -1.9623034000396729, "logits/rejected": -1.9759621620178223, "logps/chosen": -1.7461018562316895, "logps/rejected": -4.058568954467773, "loss": 0.5102, "rewards/accuracies": 1.0, "rewards/chosen": 0.8574504852294922, "rewards/margins": 0.40696001052856445, "rewards/rejected": 0.45049047470092773, "step": 4103 }, { "epoch": 2.21, "learning_rate": 4.380310788561861e-08, "logits/chosen": -2.1114885807037354, "logits/rejected": -2.2815864086151123, "logps/chosen": -1.9869738817214966, "logps/rejected": -0.5277553796768188, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 1.0435696840286255, "rewards/margins": 0.00484776496887207, "rewards/rejected": 1.0387219190597534, "step": 4104 }, { "epoch": 2.21, "learning_rate": 4.378143900062768e-08, "logits/chosen": -2.1328301429748535, "logits/rejected": -2.2826812267303467, "logps/chosen": -1.2455346584320068, "logps/rejected": -1.2201147079467773, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.8205751776695251, "rewards/margins": -0.00862807035446167, "rewards/rejected": 0.8292032480239868, "step": 4105 }, { "epoch": 2.21, "learning_rate": 4.375977130186844e-08, "logits/chosen": -2.204922914505005, "logits/rejected": -2.363424777984619, "logps/chosen": -1.9823673963546753, "logps/rejected": -2.0927207469940186, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 0.774223268032074, "rewards/margins": 0.03299993276596069, "rewards/rejected": 0.7412233352661133, "step": 4106 }, { "epoch": 2.22, "learning_rate": 4.373810479347412e-08, "logits/chosen": -2.1291236877441406, "logits/rejected": -2.1341657638549805, "logps/chosen": -2.5551962852478027, "logps/rejected": -5.309340953826904, "loss": 0.4324, "rewards/accuracies": 1.0, "rewards/chosen": 1.014443278312683, "rewards/margins": 0.6143854260444641, "rewards/rejected": 0.400057852268219, "step": 4107 }, { "epoch": 2.22, "learning_rate": 4.371643947957776e-08, "logits/chosen": -2.029393196105957, "logits/rejected": -2.0393028259277344, "logps/chosen": -0.847841739654541, "logps/rejected": -4.9369354248046875, "loss": 0.493, "rewards/accuracies": 1.0, "rewards/chosen": 1.0589216947555542, "rewards/margins": 0.45057159662246704, "rewards/rejected": 0.6083500981330872, "step": 4108 }, { "epoch": 2.22, "learning_rate": 4.369477536431217e-08, "logits/chosen": -2.12528657913208, "logits/rejected": -2.2929232120513916, "logps/chosen": -0.18053388595581055, "logps/rejected": -0.21685245633125305, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.8420059084892273, "rewards/margins": 0.025475263595581055, "rewards/rejected": 0.8165306448936462, "step": 4109 }, { "epoch": 2.22, "learning_rate": 4.367311245180992e-08, "logits/chosen": -1.9985473155975342, "logits/rejected": -1.9948333501815796, "logps/chosen": -2.6038167476654053, "logps/rejected": -1.6759816408157349, "loss": 0.5916, "rewards/accuracies": 1.0, "rewards/chosen": 1.2675278186798096, "rewards/margins": 0.2146378755569458, "rewards/rejected": 1.0528899431228638, "step": 4110 }, { "epoch": 2.22, "learning_rate": 4.365145074620335e-08, "logits/chosen": -1.9842474460601807, "logits/rejected": -1.9912784099578857, "logps/chosen": -2.827878952026367, "logps/rejected": -4.987093925476074, "loss": 0.358, "rewards/accuracies": 1.0, "rewards/chosen": 1.4332786798477173, "rewards/margins": 0.8428473472595215, "rewards/rejected": 0.5904313325881958, "step": 4111 }, { "epoch": 2.22, "learning_rate": 4.362979025162456e-08, "logits/chosen": -2.137805461883545, "logits/rejected": -2.1361193656921387, "logps/chosen": -4.5428595542907715, "logps/rejected": -5.884129524230957, "loss": 0.3135, "rewards/accuracies": 1.0, "rewards/chosen": 1.2806662321090698, "rewards/margins": 0.9991918206214905, "rewards/rejected": 0.28147441148757935, "step": 4112 }, { "epoch": 2.22, "learning_rate": 4.360813097220546e-08, "logits/chosen": -2.061795473098755, "logits/rejected": -2.0713112354278564, "logps/chosen": -5.480813980102539, "logps/rejected": -2.365143060684204, "loss": 0.4387, "rewards/accuracies": 1.0, "rewards/chosen": 1.351269006729126, "rewards/margins": 0.5965688228607178, "rewards/rejected": 0.7547001838684082, "step": 4113 }, { "epoch": 2.22, "learning_rate": 4.358647291207769e-08, "logits/chosen": -2.0398671627044678, "logits/rejected": -2.046945571899414, "logps/chosen": -3.6924889087677, "logps/rejected": -6.200760841369629, "loss": 0.3895, "rewards/accuracies": 1.0, "rewards/chosen": 1.014265537261963, "rewards/margins": 0.741807222366333, "rewards/rejected": 0.2724582850933075, "step": 4114 }, { "epoch": 2.22, "learning_rate": 4.3564816075372665e-08, "logits/chosen": -2.0506911277770996, "logits/rejected": -2.3499715328216553, "logps/chosen": -0.41518503427505493, "logps/rejected": -0.3151368200778961, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 0.9301254153251648, "rewards/margins": 0.03297299146652222, "rewards/rejected": 0.8971524238586426, "step": 4115 }, { "epoch": 2.22, "learning_rate": 4.354316046622159e-08, "logits/chosen": -2.0572099685668945, "logits/rejected": -2.0635619163513184, "logps/chosen": -1.617258071899414, "logps/rejected": -3.5229508876800537, "loss": 0.4701, "rewards/accuracies": 1.0, "rewards/chosen": 1.084030270576477, "rewards/margins": 0.5105498433113098, "rewards/rejected": 0.5734804272651672, "step": 4116 }, { "epoch": 2.22, "learning_rate": 4.352150608875539e-08, "logits/chosen": -2.002023696899414, "logits/rejected": -2.339613437652588, "logps/chosen": -1.2351667881011963, "logps/rejected": -4.169061183929443, "loss": 0.6252, "rewards/accuracies": 1.0, "rewards/chosen": 0.9982852935791016, "rewards/margins": 0.14092272520065308, "rewards/rejected": 0.8573625683784485, "step": 4117 }, { "epoch": 2.22, "learning_rate": 4.349985294710477e-08, "logits/chosen": -1.9844399690628052, "logits/rejected": -1.9784138202667236, "logps/chosen": -6.318748950958252, "logps/rejected": -3.088264226913452, "loss": 0.3087, "rewards/accuracies": 1.0, "rewards/chosen": 1.579413652420044, "rewards/margins": 1.0169217586517334, "rewards/rejected": 0.5624918937683105, "step": 4118 }, { "epoch": 2.22, "learning_rate": 4.347820104540025e-08, "logits/chosen": -2.1678452491760254, "logits/rejected": -2.1726832389831543, "logps/chosen": -2.478567123413086, "logps/rejected": -9.199832916259766, "loss": 0.4124, "rewards/accuracies": 1.0, "rewards/chosen": 1.4366098642349243, "rewards/margins": 0.6724489331245422, "rewards/rejected": 0.7641609311103821, "step": 4119 }, { "epoch": 2.22, "learning_rate": 4.345655038777205e-08, "logits/chosen": -2.0391571521759033, "logits/rejected": -2.0299715995788574, "logps/chosen": -8.695279121398926, "logps/rejected": -0.7954986095428467, "loss": 0.5328, "rewards/accuracies": 1.0, "rewards/chosen": 1.2590316534042358, "rewards/margins": 0.35151809453964233, "rewards/rejected": 0.9075135588645935, "step": 4120 }, { "epoch": 2.22, "learning_rate": 4.343490097835018e-08, "logits/chosen": -2.051392078399658, "logits/rejected": -2.278878688812256, "logps/chosen": -0.20362283289432526, "logps/rejected": -0.23698864877223969, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.8408519625663757, "rewards/margins": 0.021520376205444336, "rewards/rejected": 0.8193315863609314, "step": 4121 }, { "epoch": 2.22, "learning_rate": 4.341325282126439e-08, "logits/chosen": -1.9969637393951416, "logits/rejected": -2.30667781829834, "logps/chosen": -0.828599214553833, "logps/rejected": -0.9141616821289062, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 0.7741844058036804, "rewards/margins": 0.03572899103164673, "rewards/rejected": 0.7384554147720337, "step": 4122 }, { "epoch": 2.22, "learning_rate": 4.339160592064423e-08, "logits/chosen": -2.023237466812134, "logits/rejected": -2.3238532543182373, "logps/chosen": -4.916510105133057, "logps/rejected": -2.8102707862854004, "loss": 0.7361, "rewards/accuracies": 0.0, "rewards/chosen": 0.5591760277748108, "rewards/margins": -0.08406656980514526, "rewards/rejected": 0.643242597579956, "step": 4123 }, { "epoch": 2.22, "learning_rate": 4.3369960280619e-08, "logits/chosen": -2.043102264404297, "logits/rejected": -2.285346746444702, "logps/chosen": -7.503424644470215, "logps/rejected": -7.108705043792725, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.9302911162376404, "rewards/margins": 0.006148815155029297, "rewards/rejected": 0.9241423010826111, "step": 4124 }, { "epoch": 2.22, "learning_rate": 4.334831590531773e-08, "logits/chosen": -2.161012649536133, "logits/rejected": -2.2707886695861816, "logps/chosen": -0.7500635385513306, "logps/rejected": -2.019880771636963, "loss": 0.6522, "rewards/accuracies": 1.0, "rewards/chosen": 0.9050533175468445, "rewards/margins": 0.08359217643737793, "rewards/rejected": 0.8214611411094666, "step": 4125 }, { "epoch": 2.23, "learning_rate": 4.332667279886923e-08, "logits/chosen": -2.0706863403320312, "logits/rejected": -2.076896905899048, "logps/chosen": -2.260566234588623, "logps/rejected": -4.125523090362549, "loss": 0.5925, "rewards/accuracies": 1.0, "rewards/chosen": 0.9475540518760681, "rewards/margins": 0.21263575553894043, "rewards/rejected": 0.7349182963371277, "step": 4126 }, { "epoch": 2.23, "learning_rate": 4.3305030965402075e-08, "logits/chosen": -2.0837740898132324, "logits/rejected": -2.3323183059692383, "logps/chosen": -0.6588894128799438, "logps/rejected": -0.5690422058105469, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.9864276051521301, "rewards/margins": 0.007069289684295654, "rewards/rejected": 0.9793583154678345, "step": 4127 }, { "epoch": 2.23, "learning_rate": 4.3283390409044575e-08, "logits/chosen": -2.0376057624816895, "logits/rejected": -2.2827160358428955, "logps/chosen": -0.587146520614624, "logps/rejected": -5.3606977462768555, "loss": 0.5112, "rewards/accuracies": 1.0, "rewards/chosen": 1.1337915658950806, "rewards/margins": 0.40448933839797974, "rewards/rejected": 0.7293022274971008, "step": 4128 }, { "epoch": 2.23, "learning_rate": 4.326175113392482e-08, "logits/chosen": -2.0811009407043457, "logits/rejected": -2.0758514404296875, "logps/chosen": -3.3233962059020996, "logps/rejected": -10.40326976776123, "loss": 0.4904, "rewards/accuracies": 1.0, "rewards/chosen": 1.3455654382705688, "rewards/margins": 0.4573182463645935, "rewards/rejected": 0.8882471919059753, "step": 4129 }, { "epoch": 2.23, "learning_rate": 4.324011314417064e-08, "logits/chosen": -2.074850082397461, "logits/rejected": -2.3178303241729736, "logps/chosen": -0.4961220324039459, "logps/rejected": -0.5291635990142822, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 0.9411015510559082, "rewards/margins": -0.015421867370605469, "rewards/rejected": 0.9565234184265137, "step": 4130 }, { "epoch": 2.23, "learning_rate": 4.321847644390963e-08, "logits/chosen": -2.000023603439331, "logits/rejected": -2.2559304237365723, "logps/chosen": -0.685639500617981, "logps/rejected": -0.6247477531433105, "loss": 0.668, "rewards/accuracies": 1.0, "rewards/chosen": 0.9829621315002441, "rewards/margins": 0.050875186920166016, "rewards/rejected": 0.9320869445800781, "step": 4131 }, { "epoch": 2.23, "learning_rate": 4.3196841037269124e-08, "logits/chosen": -2.1627197265625, "logits/rejected": -2.269969940185547, "logps/chosen": -5.443058013916016, "logps/rejected": -8.866828918457031, "loss": 0.5849, "rewards/accuracies": 1.0, "rewards/chosen": 0.8829582333564758, "rewards/margins": 0.22958266735076904, "rewards/rejected": 0.6533755660057068, "step": 4132 }, { "epoch": 2.23, "learning_rate": 4.317520692837624e-08, "logits/chosen": -1.9798667430877686, "logits/rejected": -2.2725274562835693, "logps/chosen": -0.6594988107681274, "logps/rejected": -0.6311294436454773, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.9134357571601868, "rewards/margins": 0.025078654289245605, "rewards/rejected": 0.8883571028709412, "step": 4133 }, { "epoch": 2.23, "learning_rate": 4.3153574121357786e-08, "logits/chosen": -2.157285690307617, "logits/rejected": -2.2666311264038086, "logps/chosen": -4.771664619445801, "logps/rejected": -4.510338306427002, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.8600605130195618, "rewards/margins": 0.010609447956085205, "rewards/rejected": 0.8494510650634766, "step": 4134 }, { "epoch": 2.23, "learning_rate": 4.31319426203404e-08, "logits/chosen": -1.995076060295105, "logits/rejected": -2.0043861865997314, "logps/chosen": -1.6381272077560425, "logps/rejected": -2.7500691413879395, "loss": 0.4739, "rewards/accuracies": 1.0, "rewards/chosen": 1.0856807231903076, "rewards/margins": 0.5004171133041382, "rewards/rejected": 0.5852636098861694, "step": 4135 }, { "epoch": 2.23, "learning_rate": 4.311031242945043e-08, "logits/chosen": -2.1216070652008057, "logits/rejected": -2.3115696907043457, "logps/chosen": -0.8518164753913879, "logps/rejected": -0.8567343950271606, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.8973472714424133, "rewards/margins": 0.018076539039611816, "rewards/rejected": 0.8792707324028015, "step": 4136 }, { "epoch": 2.23, "learning_rate": 4.3088683552813966e-08, "logits/chosen": -2.0316410064697266, "logits/rejected": -2.0291285514831543, "logps/chosen": -6.753223419189453, "logps/rejected": -5.358921051025391, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": 1.4170421361923218, "rewards/margins": 1.0019234418869019, "rewards/rejected": 0.4151186943054199, "step": 4137 }, { "epoch": 2.23, "learning_rate": 4.306705599455687e-08, "logits/chosen": -2.1759696006774902, "logits/rejected": -2.176074981689453, "logps/chosen": -0.8065171837806702, "logps/rejected": -3.034724712371826, "loss": 0.6159, "rewards/accuracies": 1.0, "rewards/chosen": 0.9926202893257141, "rewards/margins": 0.16103899478912354, "rewards/rejected": 0.8315812945365906, "step": 4138 }, { "epoch": 2.23, "learning_rate": 4.304542975880474e-08, "logits/chosen": -2.0745482444763184, "logits/rejected": -2.070128917694092, "logps/chosen": -4.858457088470459, "logps/rejected": -4.438973426818848, "loss": 0.4285, "rewards/accuracies": 1.0, "rewards/chosen": 1.0397695302963257, "rewards/margins": 0.6254511475563049, "rewards/rejected": 0.41431838274002075, "step": 4139 }, { "epoch": 2.23, "learning_rate": 4.302380484968292e-08, "logits/chosen": -2.0157885551452637, "logits/rejected": -2.0160837173461914, "logps/chosen": -0.5131184458732605, "logps/rejected": -4.10789680480957, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/chosen": 1.0447489023208618, "rewards/margins": 0.5209379196166992, "rewards/rejected": 0.5238109827041626, "step": 4140 }, { "epoch": 2.23, "learning_rate": 4.30021812713165e-08, "logits/chosen": -2.0648202896118164, "logits/rejected": -2.3053338527679443, "logps/chosen": -1.8554720878601074, "logps/rejected": -1.8489224910736084, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.9175688028335571, "rewards/margins": 0.020847737789154053, "rewards/rejected": 0.8967210650444031, "step": 4141 }, { "epoch": 2.23, "learning_rate": 4.2980559027830334e-08, "logits/chosen": -2.1037838459014893, "logits/rejected": -2.1028878688812256, "logps/chosen": -2.7856884002685547, "logps/rejected": -5.07077169418335, "loss": 0.3235, "rewards/accuracies": 1.0, "rewards/chosen": 1.439243197441101, "rewards/margins": 0.9622868299484253, "rewards/rejected": 0.4769563376903534, "step": 4142 }, { "epoch": 2.23, "learning_rate": 4.295893812334899e-08, "logits/chosen": -2.038485527038574, "logits/rejected": -2.0185914039611816, "logps/chosen": -4.416633605957031, "logps/rejected": -5.9221343994140625, "loss": 0.2631, "rewards/accuracies": 1.0, "rewards/chosen": 1.5472793579101562, "rewards/margins": 1.200671672821045, "rewards/rejected": 0.34660768508911133, "step": 4143 }, { "epoch": 2.24, "learning_rate": 4.293731856199681e-08, "logits/chosen": -2.060530662536621, "logits/rejected": -2.063351631164551, "logps/chosen": -1.1015303134918213, "logps/rejected": -4.467949867248535, "loss": 0.4589, "rewards/accuracies": 1.0, "rewards/chosen": 1.2341595888137817, "rewards/margins": 0.5406999588012695, "rewards/rejected": 0.6934596300125122, "step": 4144 }, { "epoch": 2.24, "learning_rate": 4.291570034789785e-08, "logits/chosen": -2.068056583404541, "logits/rejected": -2.0776748657226562, "logps/chosen": -2.741708993911743, "logps/rejected": -1.3456233739852905, "loss": 0.4344, "rewards/accuracies": 1.0, "rewards/chosen": 1.346327781677246, "rewards/margins": 0.6088725328445435, "rewards/rejected": 0.7374552488327026, "step": 4145 }, { "epoch": 2.24, "learning_rate": 4.289408348517597e-08, "logits/chosen": -2.1117544174194336, "logits/rejected": -2.3026046752929688, "logps/chosen": -1.5484975576400757, "logps/rejected": -1.2456302642822266, "loss": 0.7219, "rewards/accuracies": 0.0, "rewards/chosen": 0.9043890833854675, "rewards/margins": -0.056723713874816895, "rewards/rejected": 0.9611127972602844, "step": 4146 }, { "epoch": 2.24, "learning_rate": 4.2872467977954696e-08, "logits/chosen": -2.1259195804595947, "logits/rejected": -2.124485969543457, "logps/chosen": -1.25612211227417, "logps/rejected": -3.3814449310302734, "loss": 0.5317, "rewards/accuracies": 1.0, "rewards/chosen": 0.9062202572822571, "rewards/margins": 0.3540724515914917, "rewards/rejected": 0.5521478056907654, "step": 4147 }, { "epoch": 2.24, "learning_rate": 4.285085383035734e-08, "logits/chosen": -2.145301580429077, "logits/rejected": -2.3022024631500244, "logps/chosen": -0.6671255826950073, "logps/rejected": -1.3104974031448364, "loss": 0.6281, "rewards/accuracies": 1.0, "rewards/chosen": 0.950478196144104, "rewards/margins": 0.13451731204986572, "rewards/rejected": 0.8159608840942383, "step": 4148 }, { "epoch": 2.24, "learning_rate": 4.282924104650694e-08, "logits/chosen": -2.063807487487793, "logits/rejected": -2.3281142711639404, "logps/chosen": -1.2120301723480225, "logps/rejected": -3.3858394622802734, "loss": 0.5951, "rewards/accuracies": 1.0, "rewards/chosen": 1.1174092292785645, "rewards/margins": 0.20673978328704834, "rewards/rejected": 0.9106694459915161, "step": 4149 }, { "epoch": 2.24, "learning_rate": 4.2807629630526276e-08, "logits/chosen": -1.980373501777649, "logits/rejected": -2.305176019668579, "logps/chosen": -0.10926610976457596, "logps/rejected": -0.10345273464918137, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.8356416821479797, "rewards/margins": 0.007781386375427246, "rewards/rejected": 0.8278602957725525, "step": 4150 }, { "epoch": 2.24, "learning_rate": 4.278601958653788e-08, "logits/chosen": -2.1162774562835693, "logits/rejected": -2.0977840423583984, "logps/chosen": -4.155892372131348, "logps/rejected": -5.2930006980896, "loss": 0.4518, "rewards/accuracies": 1.0, "rewards/chosen": 1.1576366424560547, "rewards/margins": 0.5600131154060364, "rewards/rejected": 0.5976235270500183, "step": 4151 }, { "epoch": 2.24, "learning_rate": 4.276441091866399e-08, "logits/chosen": -2.1779940128326416, "logits/rejected": -2.187349557876587, "logps/chosen": -1.8587007522583008, "logps/rejected": -3.0855085849761963, "loss": 0.4766, "rewards/accuracies": 1.0, "rewards/chosen": 1.1329139471054077, "rewards/margins": 0.49336981773376465, "rewards/rejected": 0.6395441293716431, "step": 4152 }, { "epoch": 2.24, "learning_rate": 4.2742803631026616e-08, "logits/chosen": -2.1637187004089355, "logits/rejected": -2.163975477218628, "logps/chosen": -0.6607152819633484, "logps/rejected": -5.076075553894043, "loss": 0.3627, "rewards/accuracies": 1.0, "rewards/chosen": 1.1004979610443115, "rewards/margins": 0.8274383544921875, "rewards/rejected": 0.273059606552124, "step": 4153 }, { "epoch": 2.24, "learning_rate": 4.272119772774749e-08, "logits/chosen": -2.0387468338012695, "logits/rejected": -2.301870584487915, "logps/chosen": -2.3809783458709717, "logps/rejected": -2.2026500701904297, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.7106825113296509, "rewards/margins": 0.00837939977645874, "rewards/rejected": 0.7023031115531921, "step": 4154 }, { "epoch": 2.24, "learning_rate": 4.2699593212948084e-08, "logits/chosen": -2.089172124862671, "logits/rejected": -2.2606821060180664, "logps/chosen": -6.599586486816406, "logps/rejected": -10.888845443725586, "loss": 0.597, "rewards/accuracies": 1.0, "rewards/chosen": 0.8165493011474609, "rewards/margins": 0.20258080959320068, "rewards/rejected": 0.6139684915542603, "step": 4155 }, { "epoch": 2.24, "learning_rate": 4.2677990090749605e-08, "logits/chosen": -2.148047924041748, "logits/rejected": -2.1317925453186035, "logps/chosen": -8.251320838928223, "logps/rejected": -4.545150279998779, "loss": 0.3101, "rewards/accuracies": 1.0, "rewards/chosen": 1.5193564891815186, "rewards/margins": 1.011847972869873, "rewards/rejected": 0.5075085163116455, "step": 4156 }, { "epoch": 2.24, "learning_rate": 4.2656388365272966e-08, "logits/chosen": -2.160735607147217, "logits/rejected": -2.1584129333496094, "logps/chosen": -3.753343343734741, "logps/rejected": -5.2577972412109375, "loss": 0.2241, "rewards/accuracies": 1.0, "rewards/chosen": 1.7236347198486328, "rewards/margins": 1.381654977798462, "rewards/rejected": 0.3419797122478485, "step": 4157 }, { "epoch": 2.24, "learning_rate": 4.263478804063888e-08, "logits/chosen": -2.0840322971343994, "logits/rejected": -2.0827364921569824, "logps/chosen": -0.8927035927772522, "logps/rejected": -2.255328893661499, "loss": 0.5575, "rewards/accuracies": 1.0, "rewards/chosen": 1.0244406461715698, "rewards/margins": 0.2926155924797058, "rewards/rejected": 0.731825053691864, "step": 4158 }, { "epoch": 2.24, "learning_rate": 4.2613189120967716e-08, "logits/chosen": -2.0717172622680664, "logits/rejected": -2.0666263103485107, "logps/chosen": -9.63491153717041, "logps/rejected": -6.111003875732422, "loss": 0.2349, "rewards/accuracies": 1.0, "rewards/chosen": 1.8786567449569702, "rewards/margins": 1.3288631439208984, "rewards/rejected": 0.5497936606407166, "step": 4159 }, { "epoch": 2.24, "learning_rate": 4.259159161037963e-08, "logits/chosen": -1.9856778383255005, "logits/rejected": -2.257800340652466, "logps/chosen": -1.4754301309585571, "logps/rejected": -1.5502586364746094, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.9283881187438965, "rewards/margins": 0.014751255512237549, "rewards/rejected": 0.9136368632316589, "step": 4160 }, { "epoch": 2.24, "learning_rate": 4.256999551299449e-08, "logits/chosen": -2.102524757385254, "logits/rejected": -2.3386802673339844, "logps/chosen": -6.7688069343566895, "logps/rejected": -6.344069004058838, "loss": 0.7106, "rewards/accuracies": 0.0, "rewards/chosen": 0.9459671378135681, "rewards/margins": -0.034551799297332764, "rewards/rejected": 0.9805189371109009, "step": 4161 }, { "epoch": 2.24, "learning_rate": 4.2548400832931893e-08, "logits/chosen": -2.1399755477905273, "logits/rejected": -2.1345441341400146, "logps/chosen": -4.3974456787109375, "logps/rejected": -7.556901931762695, "loss": 0.5816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9495014548301697, "rewards/margins": 0.23708653450012207, "rewards/rejected": 0.7124149203300476, "step": 4162 }, { "epoch": 2.25, "learning_rate": 4.252680757431117e-08, "logits/chosen": -2.0631773471832275, "logits/rejected": -2.0678517818450928, "logps/chosen": -0.6938024163246155, "logps/rejected": -2.736279249191284, "loss": 0.4782, "rewards/accuracies": 1.0, "rewards/chosen": 1.0664639472961426, "rewards/margins": 0.48912984132766724, "rewards/rejected": 0.5773341059684753, "step": 4163 }, { "epoch": 2.25, "learning_rate": 4.250521574125136e-08, "logits/chosen": -2.003877878189087, "logits/rejected": -2.0011956691741943, "logps/chosen": -0.2809809744358063, "logps/rejected": -5.742734909057617, "loss": 0.4077, "rewards/accuracies": 1.0, "rewards/chosen": 1.0521008968353271, "rewards/margins": 0.6863486766815186, "rewards/rejected": 0.3657522201538086, "step": 4164 }, { "epoch": 2.25, "learning_rate": 4.2483625337871266e-08, "logits/chosen": -2.1418707370758057, "logits/rejected": -2.1439826488494873, "logps/chosen": -0.8170989751815796, "logps/rejected": -2.7816598415374756, "loss": 0.5448, "rewards/accuracies": 1.0, "rewards/chosen": 1.076497197151184, "rewards/margins": 0.3225345015525818, "rewards/rejected": 0.7539626955986023, "step": 4165 }, { "epoch": 2.25, "learning_rate": 4.246203636828939e-08, "logits/chosen": -2.1043193340301514, "logits/rejected": -2.3211252689361572, "logps/chosen": -1.005791187286377, "logps/rejected": -0.9495133757591248, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9659363031387329, "rewards/margins": 0.015062928199768066, "rewards/rejected": 0.9508733749389648, "step": 4166 }, { "epoch": 2.25, "learning_rate": 4.2440448836623976e-08, "logits/chosen": -2.1408283710479736, "logits/rejected": -2.13430118560791, "logps/chosen": -2.7618706226348877, "logps/rejected": -2.7821388244628906, "loss": 0.4194, "rewards/accuracies": 1.0, "rewards/chosen": 1.3171541690826416, "rewards/margins": 0.6520426273345947, "rewards/rejected": 0.6651115417480469, "step": 4167 }, { "epoch": 2.25, "learning_rate": 4.241886274699297e-08, "logits/chosen": -2.08516263961792, "logits/rejected": -2.085273265838623, "logps/chosen": -0.5080225467681885, "logps/rejected": -2.8989880084991455, "loss": 0.5762, "rewards/accuracies": 1.0, "rewards/chosen": 0.826080322265625, "rewards/margins": 0.24949771165847778, "rewards/rejected": 0.5765826106071472, "step": 4168 }, { "epoch": 2.25, "learning_rate": 4.2397278103514105e-08, "logits/chosen": -2.08294677734375, "logits/rejected": -2.0881004333496094, "logps/chosen": -2.4831347465515137, "logps/rejected": -4.916684627532959, "loss": 0.3931, "rewards/accuracies": 1.0, "rewards/chosen": 1.2332277297973633, "rewards/margins": 0.7306199669837952, "rewards/rejected": 0.5026077628135681, "step": 4169 }, { "epoch": 2.25, "learning_rate": 4.2375694910304763e-08, "logits/chosen": -2.0538272857666016, "logits/rejected": -2.05055832862854, "logps/chosen": -2.1173408031463623, "logps/rejected": -5.967217445373535, "loss": 0.3849, "rewards/accuracies": 1.0, "rewards/chosen": 1.2607907056808472, "rewards/margins": 0.756241500377655, "rewards/rejected": 0.5045492053031921, "step": 4170 }, { "epoch": 2.25, "learning_rate": 4.235411317148209e-08, "logits/chosen": -2.0742547512054443, "logits/rejected": -2.301710605621338, "logps/chosen": -3.8642184734344482, "logps/rejected": -3.071176528930664, "loss": 0.7207, "rewards/accuracies": 0.0, "rewards/chosen": 0.5915701985359192, "rewards/margins": -0.05435401201248169, "rewards/rejected": 0.6459242105484009, "step": 4171 }, { "epoch": 2.25, "learning_rate": 4.233253289116294e-08, "logits/chosen": -2.006652593612671, "logits/rejected": -1.99714994430542, "logps/chosen": -4.630311965942383, "logps/rejected": -5.739654064178467, "loss": 0.2645, "rewards/accuracies": 1.0, "rewards/chosen": 1.617762804031372, "rewards/margins": 1.194921612739563, "rewards/rejected": 0.42284122109413147, "step": 4172 }, { "epoch": 2.25, "learning_rate": 4.23109540734639e-08, "logits/chosen": -1.9719005823135376, "logits/rejected": -1.973118782043457, "logps/chosen": -3.690324306488037, "logps/rejected": -1.9765148162841797, "loss": 0.3664, "rewards/accuracies": 1.0, "rewards/chosen": 1.4913583993911743, "rewards/margins": 0.8152397274971008, "rewards/rejected": 0.6761186718940735, "step": 4173 }, { "epoch": 2.25, "learning_rate": 4.2289376722501266e-08, "logits/chosen": -2.1936964988708496, "logits/rejected": -2.308926820755005, "logps/chosen": -5.702921390533447, "logps/rejected": -30.616050720214844, "loss": 0.3779, "rewards/accuracies": 1.0, "rewards/chosen": 1.4328988790512085, "rewards/margins": 0.7782811522483826, "rewards/rejected": 0.6546177268028259, "step": 4174 }, { "epoch": 2.25, "learning_rate": 4.226780084239107e-08, "logits/chosen": -2.105806350708008, "logits/rejected": -2.1125881671905518, "logps/chosen": -2.27437686920166, "logps/rejected": -7.2604475021362305, "loss": 0.3179, "rewards/accuracies": 1.0, "rewards/chosen": 1.2491507530212402, "rewards/margins": 0.9827281832695007, "rewards/rejected": 0.2664225697517395, "step": 4175 }, { "epoch": 2.25, "learning_rate": 4.224622643724904e-08, "logits/chosen": -2.204697847366333, "logits/rejected": -2.1530091762542725, "logps/chosen": -14.89425277709961, "logps/rejected": -4.538097858428955, "loss": 0.2321, "rewards/accuracies": 1.0, "rewards/chosen": 1.838401436805725, "rewards/margins": 1.3424054384231567, "rewards/rejected": 0.49599596858024597, "step": 4176 }, { "epoch": 2.25, "learning_rate": 4.2224653511190645e-08, "logits/chosen": -1.989047884941101, "logits/rejected": -2.2891087532043457, "logps/chosen": -0.5285648107528687, "logps/rejected": -0.5411854386329651, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 1.0748087167739868, "rewards/margins": 0.013162612915039062, "rewards/rejected": 1.0616461038589478, "step": 4177 }, { "epoch": 2.25, "learning_rate": 4.220308206833107e-08, "logits/chosen": -2.005638599395752, "logits/rejected": -2.01690411567688, "logps/chosen": -0.7046871781349182, "logps/rejected": -10.948002815246582, "loss": 0.5279, "rewards/accuracies": 1.0, "rewards/chosen": 1.039537787437439, "rewards/margins": 0.3632912039756775, "rewards/rejected": 0.6762465834617615, "step": 4178 }, { "epoch": 2.25, "learning_rate": 4.2181512112785187e-08, "logits/chosen": -2.090608596801758, "logits/rejected": -2.0791947841644287, "logps/chosen": -0.2588823139667511, "logps/rejected": -7.6928582191467285, "loss": 0.401, "rewards/accuracies": 1.0, "rewards/chosen": 1.0252805948257446, "rewards/margins": 0.7066353559494019, "rewards/rejected": 0.3186452388763428, "step": 4179 }, { "epoch": 2.25, "learning_rate": 4.215994364866764e-08, "logits/chosen": -2.1224215030670166, "logits/rejected": -2.341787815093994, "logps/chosen": -0.48995479941368103, "logps/rejected": -21.931310653686523, "loss": 0.5953, "rewards/accuracies": 1.0, "rewards/chosen": 1.9086703062057495, "rewards/margins": 0.20625877380371094, "rewards/rejected": 1.7024115324020386, "step": 4180 }, { "epoch": 2.26, "learning_rate": 4.2138376680092736e-08, "logits/chosen": -2.1669363975524902, "logits/rejected": -2.2454168796539307, "logps/chosen": -4.492936134338379, "logps/rejected": -15.754101753234863, "loss": 0.4868, "rewards/accuracies": 1.0, "rewards/chosen": 1.203457236289978, "rewards/margins": 0.4666849970817566, "rewards/rejected": 0.7367722392082214, "step": 4181 }, { "epoch": 2.26, "learning_rate": 4.211681121117452e-08, "logits/chosen": -2.078378915786743, "logits/rejected": -2.2441160678863525, "logps/chosen": -0.41713398694992065, "logps/rejected": -0.40973982214927673, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.8586196899414062, "rewards/margins": 0.025641143321990967, "rewards/rejected": 0.8329785466194153, "step": 4182 }, { "epoch": 2.26, "learning_rate": 4.209524724602676e-08, "logits/chosen": -2.0524697303771973, "logits/rejected": -2.058972120285034, "logps/chosen": -2.734088182449341, "logps/rejected": -6.165492057800293, "loss": 0.3973, "rewards/accuracies": 1.0, "rewards/chosen": 1.0802582502365112, "rewards/margins": 0.7178694009780884, "rewards/rejected": 0.36238881945610046, "step": 4183 }, { "epoch": 2.26, "learning_rate": 4.2073684788762906e-08, "logits/chosen": -2.162935733795166, "logits/rejected": -2.159461498260498, "logps/chosen": -2.7936785221099854, "logps/rejected": -7.743832588195801, "loss": 0.3508, "rewards/accuracies": 1.0, "rewards/chosen": 1.5580675601959229, "rewards/margins": 0.8671601414680481, "rewards/rejected": 0.6909074187278748, "step": 4184 }, { "epoch": 2.26, "learning_rate": 4.205212384349614e-08, "logits/chosen": -2.171527624130249, "logits/rejected": -2.286207914352417, "logps/chosen": -3.12947678565979, "logps/rejected": -2.9342362880706787, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.5208610892295837, "rewards/margins": 0.019275128841400146, "rewards/rejected": 0.5015859603881836, "step": 4185 }, { "epoch": 2.26, "learning_rate": 4.2030564414339395e-08, "logits/chosen": -2.07974910736084, "logits/rejected": -2.079314708709717, "logps/chosen": -0.8481993675231934, "logps/rejected": -3.887744903564453, "loss": 0.4966, "rewards/accuracies": 1.0, "rewards/chosen": 1.1830272674560547, "rewards/margins": 0.44145041704177856, "rewards/rejected": 0.7415768504142761, "step": 4186 }, { "epoch": 2.26, "learning_rate": 4.200900650540522e-08, "logits/chosen": -2.145634889602661, "logits/rejected": -2.3226654529571533, "logps/chosen": -0.41415804624557495, "logps/rejected": -0.4874059557914734, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427618980407715, "rewards/margins": 0.016063034534454346, "rewards/rejected": 0.9266988635063171, "step": 4187 }, { "epoch": 2.26, "learning_rate": 4.198745012080595e-08, "logits/chosen": -2.0179154872894287, "logits/rejected": -2.0275354385375977, "logps/chosen": -2.483771800994873, "logps/rejected": -1.8827214241027832, "loss": 0.4066, "rewards/accuracies": 1.0, "rewards/chosen": 1.3516008853912354, "rewards/margins": 0.6896761059761047, "rewards/rejected": 0.6619247794151306, "step": 4188 }, { "epoch": 2.26, "learning_rate": 4.1965895264653614e-08, "logits/chosen": -2.0541300773620605, "logits/rejected": -2.06290864944458, "logps/chosen": -1.9588819742202759, "logps/rejected": -1.390572428703308, "loss": 0.3431, "rewards/accuracies": 1.0, "rewards/chosen": 1.637590765953064, "rewards/margins": 0.8934254050254822, "rewards/rejected": 0.7441653609275818, "step": 4189 }, { "epoch": 2.26, "learning_rate": 4.194434194105992e-08, "logits/chosen": -2.129741907119751, "logits/rejected": -2.1283421516418457, "logps/chosen": -4.874218940734863, "logps/rejected": -9.503449440002441, "loss": 0.2079, "rewards/accuracies": 1.0, "rewards/chosen": 1.4547877311706543, "rewards/margins": 1.464787244796753, "rewards/rejected": -0.009999466128647327, "step": 4190 }, { "epoch": 2.26, "learning_rate": 4.192279015413634e-08, "logits/chosen": -2.193701982498169, "logits/rejected": -2.203200578689575, "logps/chosen": -1.4685733318328857, "logps/rejected": -3.82912540435791, "loss": 0.4397, "rewards/accuracies": 1.0, "rewards/chosen": 1.0307681560516357, "rewards/margins": 0.5937975645065308, "rewards/rejected": 0.43697062134742737, "step": 4191 }, { "epoch": 2.26, "learning_rate": 4.190123990799402e-08, "logits/chosen": -2.1143927574157715, "logits/rejected": -2.2608516216278076, "logps/chosen": -1.7510697841644287, "logps/rejected": -1.6712669134140015, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8080782890319824, "rewards/margins": 0.01986175775527954, "rewards/rejected": 0.7882165312767029, "step": 4192 }, { "epoch": 2.26, "learning_rate": 4.18796912067438e-08, "logits/chosen": -2.0265872478485107, "logits/rejected": -2.032994031906128, "logps/chosen": -1.496902585029602, "logps/rejected": -6.234838485717773, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 1.118386149406433, "rewards/margins": 0.8643958568572998, "rewards/rejected": 0.2539902627468109, "step": 4193 }, { "epoch": 2.26, "learning_rate": 4.1858144054496223e-08, "logits/chosen": -2.1300060749053955, "logits/rejected": -2.1224284172058105, "logps/chosen": -0.6347125172615051, "logps/rejected": -13.792693138122559, "loss": 0.364, "rewards/accuracies": 1.0, "rewards/chosen": 1.234210729598999, "rewards/margins": 0.8230555057525635, "rewards/rejected": 0.41115522384643555, "step": 4194 }, { "epoch": 2.26, "learning_rate": 4.1836598455361574e-08, "logits/chosen": -2.157580614089966, "logits/rejected": -2.16172456741333, "logps/chosen": -1.8978368043899536, "logps/rejected": -3.3601038455963135, "loss": 0.4818, "rewards/accuracies": 1.0, "rewards/chosen": 1.048767328262329, "rewards/margins": 0.4796324372291565, "rewards/rejected": 0.5691348910331726, "step": 4195 }, { "epoch": 2.26, "learning_rate": 4.18150544134498e-08, "logits/chosen": -2.061411142349243, "logits/rejected": -2.0505659580230713, "logps/chosen": -4.57761287689209, "logps/rejected": -2.1840217113494873, "loss": 0.3788, "rewards/accuracies": 1.0, "rewards/chosen": 1.6955108642578125, "rewards/margins": 0.7754839658737183, "rewards/rejected": 0.9200268983840942, "step": 4196 }, { "epoch": 2.26, "learning_rate": 4.179351193287058e-08, "logits/chosen": -2.1361114978790283, "logits/rejected": -2.2695796489715576, "logps/chosen": -7.259548187255859, "logps/rejected": -7.342047691345215, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 1.051144003868103, "rewards/margins": 0.0033708810806274414, "rewards/rejected": 1.0477731227874756, "step": 4197 }, { "epoch": 2.26, "learning_rate": 4.177197101773329e-08, "logits/chosen": -2.0898733139038086, "logits/rejected": -2.0904769897460938, "logps/chosen": -1.1923021078109741, "logps/rejected": -2.2150111198425293, "loss": 0.6181, "rewards/accuracies": 1.0, "rewards/chosen": 1.0414472818374634, "rewards/margins": 0.15613311529159546, "rewards/rejected": 0.8853141665458679, "step": 4198 }, { "epoch": 2.26, "learning_rate": 4.175043167214699e-08, "logits/chosen": -2.102348804473877, "logits/rejected": -2.037977933883667, "logps/chosen": -18.46829605102539, "logps/rejected": -3.4497909545898438, "loss": 0.2663, "rewards/accuracies": 1.0, "rewards/chosen": 1.8648197650909424, "rewards/margins": 1.187009334564209, "rewards/rejected": 0.6778104901313782, "step": 4199 }, { "epoch": 2.27, "learning_rate": 4.172889390022046e-08, "logits/chosen": -2.0082058906555176, "logits/rejected": -2.284397840499878, "logps/chosen": -1.4631866216659546, "logps/rejected": -1.2398186922073364, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.7964769005775452, "rewards/margins": -0.0037780404090881348, "rewards/rejected": 0.8002549409866333, "step": 4200 }, { "epoch": 2.27, "learning_rate": 4.170735770606216e-08, "logits/chosen": -2.2139809131622314, "logits/rejected": -2.2113847732543945, "logps/chosen": -0.6302396059036255, "logps/rejected": -8.146212577819824, "loss": 0.3463, "rewards/accuracies": 1.0, "rewards/chosen": 1.0781272649765015, "rewards/margins": 0.8822322487831116, "rewards/rejected": 0.1958950012922287, "step": 4201 }, { "epoch": 2.27, "learning_rate": 4.168582309378028e-08, "logits/chosen": -1.9579485654830933, "logits/rejected": -2.298304319381714, "logps/chosen": -2.964247703552246, "logps/rejected": -3.678436756134033, "loss": 0.6567, "rewards/accuracies": 1.0, "rewards/chosen": 1.054386019706726, "rewards/margins": 0.07427948713302612, "rewards/rejected": 0.9801065325737, "step": 4202 }, { "epoch": 2.27, "learning_rate": 4.166429006748268e-08, "logits/chosen": -2.1109962463378906, "logits/rejected": -2.280120372772217, "logps/chosen": -0.9428544044494629, "logps/rejected": -0.9735074043273926, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 1.1971092224121094, "rewards/margins": 0.024933934211730957, "rewards/rejected": 1.1721752882003784, "step": 4203 }, { "epoch": 2.27, "learning_rate": 4.1642758631276933e-08, "logits/chosen": -1.9531080722808838, "logits/rejected": -2.2851243019104004, "logps/chosen": -0.8721975684165955, "logps/rejected": -1.0271166563034058, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.8827806711196899, "rewards/margins": 0.0034436583518981934, "rewards/rejected": 0.8793370127677917, "step": 4204 }, { "epoch": 2.27, "learning_rate": 4.1621228789270296e-08, "logits/chosen": -2.0155625343322754, "logits/rejected": -2.0134568214416504, "logps/chosen": -2.81699275970459, "logps/rejected": -6.066669464111328, "loss": 0.248, "rewards/accuracies": 1.0, "rewards/chosen": 1.5755537748336792, "rewards/margins": 1.2679731845855713, "rewards/rejected": 0.3075805604457855, "step": 4205 }, { "epoch": 2.27, "learning_rate": 4.159970054556974e-08, "logits/chosen": -2.0916802883148193, "logits/rejected": -2.31103515625, "logps/chosen": -0.24294909834861755, "logps/rejected": -0.28418856859207153, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9249327778816223, "rewards/margins": 0.022119343280792236, "rewards/rejected": 0.9028134346008301, "step": 4206 }, { "epoch": 2.27, "learning_rate": 4.15781739042819e-08, "logits/chosen": -2.1448962688446045, "logits/rejected": -2.1531307697296143, "logps/chosen": -2.222757339477539, "logps/rejected": -5.097355365753174, "loss": 0.3948, "rewards/accuracies": 1.0, "rewards/chosen": 1.149819254875183, "rewards/margins": 0.7253443002700806, "rewards/rejected": 0.42447495460510254, "step": 4207 }, { "epoch": 2.27, "learning_rate": 4.155664886951315e-08, "logits/chosen": -2.2060766220092773, "logits/rejected": -2.2183163166046143, "logps/chosen": -1.515640377998352, "logps/rejected": -4.126933574676514, "loss": 0.413, "rewards/accuracies": 1.0, "rewards/chosen": 1.3487522602081299, "rewards/margins": 0.6706934571266174, "rewards/rejected": 0.6780588030815125, "step": 4208 }, { "epoch": 2.27, "learning_rate": 4.153512544536951e-08, "logits/chosen": -2.0403246879577637, "logits/rejected": -2.0503244400024414, "logps/chosen": -1.2912923097610474, "logps/rejected": -2.7363641262054443, "loss": 0.4818, "rewards/accuracies": 1.0, "rewards/chosen": 1.0696135759353638, "rewards/margins": 0.47958701848983765, "rewards/rejected": 0.5900265574455261, "step": 4209 }, { "epoch": 2.27, "learning_rate": 4.1513603635956744e-08, "logits/chosen": -2.075054168701172, "logits/rejected": -2.2585291862487793, "logps/chosen": -1.040727138519287, "logps/rejected": -0.9602599143981934, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.830763041973114, "rewards/margins": 0.014906466007232666, "rewards/rejected": 0.8158565759658813, "step": 4210 }, { "epoch": 2.27, "learning_rate": 4.1492083445380255e-08, "logits/chosen": -2.0886969566345215, "logits/rejected": -2.037848472595215, "logps/chosen": -25.62740707397461, "logps/rejected": -3.300154447555542, "loss": 0.2898, "rewards/accuracies": 1.0, "rewards/chosen": 1.8259304761886597, "rewards/margins": 1.0903162956237793, "rewards/rejected": 0.7356141209602356, "step": 4211 }, { "epoch": 2.27, "learning_rate": 4.1470564877745165e-08, "logits/chosen": -2.0663721561431885, "logits/rejected": -2.2805609703063965, "logps/chosen": -0.5819931030273438, "logps/rejected": -0.5923643708229065, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.9205547571182251, "rewards/margins": 0.018552720546722412, "rewards/rejected": 0.9020020365715027, "step": 4212 }, { "epoch": 2.27, "learning_rate": 4.1449047937156255e-08, "logits/chosen": -2.1173245906829834, "logits/rejected": -2.0553359985351562, "logps/chosen": -12.104154586791992, "logps/rejected": -22.290430068969727, "loss": 0.1953, "rewards/accuracies": 1.0, "rewards/chosen": 1.7571715116500854, "rewards/margins": 1.5337430238723755, "rewards/rejected": 0.22342853248119354, "step": 4213 }, { "epoch": 2.27, "learning_rate": 4.1427532627718086e-08, "logits/chosen": -1.9939463138580322, "logits/rejected": -1.9956287145614624, "logps/chosen": -0.9272079467773438, "logps/rejected": -1.7756190299987793, "loss": 0.5121, "rewards/accuracies": 1.0, "rewards/chosen": 1.1642496585845947, "rewards/margins": 0.40219974517822266, "rewards/rejected": 0.7620499134063721, "step": 4214 }, { "epoch": 2.27, "learning_rate": 4.140601895353481e-08, "logits/chosen": -2.0776495933532715, "logits/rejected": -1.9696638584136963, "logps/chosen": -28.442138671875, "logps/rejected": -1.8374311923980713, "loss": 0.1722, "rewards/accuracies": 1.0, "rewards/chosen": 2.4877781867980957, "rewards/margins": 1.6719869375228882, "rewards/rejected": 0.8157912492752075, "step": 4215 }, { "epoch": 2.27, "learning_rate": 4.138450691871033e-08, "logits/chosen": -1.9929813146591187, "logits/rejected": -2.2900686264038086, "logps/chosen": -2.101048469543457, "logps/rejected": -1.7651304006576538, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8872065544128418, "rewards/margins": 0.021130621433258057, "rewards/rejected": 0.8660759329795837, "step": 4216 }, { "epoch": 2.27, "learning_rate": 4.136299652734817e-08, "logits/chosen": -2.077996015548706, "logits/rejected": -2.297693967819214, "logps/chosen": -2.2969460487365723, "logps/rejected": -2.442295551300049, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.8857599496841431, "rewards/margins": 0.016370952129364014, "rewards/rejected": 0.869388997554779, "step": 4217 }, { "epoch": 2.28, "learning_rate": 4.134148778355159e-08, "logits/chosen": -2.0106546878814697, "logits/rejected": -2.005727529525757, "logps/chosen": -3.912574529647827, "logps/rejected": -2.25628399848938, "loss": 0.4295, "rewards/accuracies": 1.0, "rewards/chosen": 1.4107319116592407, "rewards/margins": 0.622642993927002, "rewards/rejected": 0.7880889177322388, "step": 4218 }, { "epoch": 2.28, "learning_rate": 4.131998069142354e-08, "logits/chosen": -2.0870821475982666, "logits/rejected": -2.037260055541992, "logps/chosen": -27.006649017333984, "logps/rejected": -2.1103343963623047, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": 2.0556957721710205, "rewards/margins": 1.2624361515045166, "rewards/rejected": 0.7932596206665039, "step": 4219 }, { "epoch": 2.28, "learning_rate": 4.1298475255066634e-08, "logits/chosen": -2.1089251041412354, "logits/rejected": -2.3538639545440674, "logps/chosen": -1.094269871711731, "logps/rejected": -1.0563277006149292, "loss": 0.6695, "rewards/accuracies": 1.0, "rewards/chosen": 1.0297883749008179, "rewards/margins": 0.04780852794647217, "rewards/rejected": 0.9819798469543457, "step": 4220 }, { "epoch": 2.28, "learning_rate": 4.1276971478583164e-08, "logits/chosen": -2.0395641326904297, "logits/rejected": -2.0498292446136475, "logps/chosen": -2.436932325363159, "logps/rejected": -2.201206922531128, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 1.009807825088501, "rewards/margins": 0.5089244246482849, "rewards/rejected": 0.5008834004402161, "step": 4221 }, { "epoch": 2.28, "learning_rate": 4.125546936607514e-08, "logits/chosen": -1.994404911994934, "logits/rejected": -1.9983917474746704, "logps/chosen": -1.9239810705184937, "logps/rejected": -3.192277431488037, "loss": 0.5441, "rewards/accuracies": 1.0, "rewards/chosen": 1.0218079090118408, "rewards/margins": 0.32431286573410034, "rewards/rejected": 0.6974950432777405, "step": 4222 }, { "epoch": 2.28, "learning_rate": 4.1233968921644215e-08, "logits/chosen": -2.115281343460083, "logits/rejected": -1.9839025735855103, "logps/chosen": -16.938549041748047, "logps/rejected": -8.388545036315918, "loss": 0.3017, "rewards/accuracies": 1.0, "rewards/chosen": 1.873449683189392, "rewards/margins": 1.0436944961547852, "rewards/rejected": 0.8297551274299622, "step": 4223 }, { "epoch": 2.28, "learning_rate": 4.121247014939174e-08, "logits/chosen": -2.095536708831787, "logits/rejected": -2.1987709999084473, "logps/chosen": -0.9775358438491821, "logps/rejected": -25.883651733398438, "loss": 0.2726, "rewards/accuracies": 1.0, "rewards/chosen": 1.1753017902374268, "rewards/margins": 1.160343050956726, "rewards/rejected": 0.014958763495087624, "step": 4224 }, { "epoch": 2.28, "learning_rate": 4.119097305341876e-08, "logits/chosen": -2.0636587142944336, "logits/rejected": -2.0515658855438232, "logps/chosen": -18.465784072875977, "logps/rejected": -1.136091709136963, "loss": 0.2121, "rewards/accuracies": 1.0, "rewards/chosen": 2.2837958335876465, "rewards/margins": 1.4425486326217651, "rewards/rejected": 0.8412472009658813, "step": 4225 }, { "epoch": 2.28, "learning_rate": 4.1169477637825986e-08, "logits/chosen": -2.140629291534424, "logits/rejected": -2.300194263458252, "logps/chosen": -2.899350166320801, "logps/rejected": -2.8529622554779053, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.9793853759765625, "rewards/margins": 0.023384153842926025, "rewards/rejected": 0.9560012221336365, "step": 4226 }, { "epoch": 2.28, "learning_rate": 4.11479839067138e-08, "logits/chosen": -2.070002794265747, "logits/rejected": -2.275460720062256, "logps/chosen": -0.39051535725593567, "logps/rejected": -5.008418560028076, "loss": 0.5638, "rewards/accuracies": 1.0, "rewards/chosen": 1.0016616582870483, "rewards/margins": 0.2779915928840637, "rewards/rejected": 0.7236700654029846, "step": 4227 }, { "epoch": 2.28, "learning_rate": 4.112649186418229e-08, "logits/chosen": -2.064297914505005, "logits/rejected": -2.221792697906494, "logps/chosen": -0.7502805590629578, "logps/rejected": -0.8409544825553894, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 1.0778312683105469, "rewards/margins": 0.0057212114334106445, "rewards/rejected": 1.0721100568771362, "step": 4228 }, { "epoch": 2.28, "learning_rate": 4.11050015143312e-08, "logits/chosen": -1.9992178678512573, "logits/rejected": -2.2937979698181152, "logps/chosen": -0.20750932395458221, "logps/rejected": -0.21085835993289948, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.7986759543418884, "rewards/margins": 0.004651308059692383, "rewards/rejected": 0.794024646282196, "step": 4229 }, { "epoch": 2.28, "learning_rate": 4.108351286125994e-08, "logits/chosen": -2.128688097000122, "logits/rejected": -2.14115047454834, "logps/chosen": -7.218433856964111, "logps/rejected": -2.13356351852417, "loss": 0.5337, "rewards/accuracies": 1.0, "rewards/chosen": 1.0988807678222656, "rewards/margins": 0.34916698932647705, "rewards/rejected": 0.7497137784957886, "step": 4230 }, { "epoch": 2.28, "learning_rate": 4.1062025909067634e-08, "logits/chosen": -2.003018856048584, "logits/rejected": -2.019254207611084, "logps/chosen": -1.421442985534668, "logps/rejected": -8.658051490783691, "loss": 0.4532, "rewards/accuracies": 1.0, "rewards/chosen": 1.1679507493972778, "rewards/margins": 0.5563718676567078, "rewards/rejected": 0.6115788817405701, "step": 4231 }, { "epoch": 2.28, "learning_rate": 4.104054066185305e-08, "logits/chosen": -2.133197546005249, "logits/rejected": -2.1342992782592773, "logps/chosen": -0.2690787613391876, "logps/rejected": -5.152050971984863, "loss": 0.4671, "rewards/accuracies": 1.0, "rewards/chosen": 0.9838973879814148, "rewards/margins": 0.5185810923576355, "rewards/rejected": 0.4653162956237793, "step": 4232 }, { "epoch": 2.28, "learning_rate": 4.101905712371466e-08, "logits/chosen": -2.208266258239746, "logits/rejected": -2.2076053619384766, "logps/chosen": -1.564515233039856, "logps/rejected": -7.09591007232666, "loss": 0.3476, "rewards/accuracies": 1.0, "rewards/chosen": 1.1156682968139648, "rewards/margins": 0.8778462409973145, "rewards/rejected": 0.2378220558166504, "step": 4233 }, { "epoch": 2.28, "learning_rate": 4.099757529875056e-08, "logits/chosen": -1.9797561168670654, "logits/rejected": -2.2660577297210693, "logps/chosen": -8.567438125610352, "logps/rejected": -9.262127876281738, "loss": 0.6544, "rewards/accuracies": 1.0, "rewards/chosen": 0.49512979388237, "rewards/margins": 0.07902213931083679, "rewards/rejected": 0.4161076545715332, "step": 4234 }, { "epoch": 2.28, "learning_rate": 4.097609519105855e-08, "logits/chosen": -1.9605915546417236, "logits/rejected": -1.929474115371704, "logps/chosen": -9.952930450439453, "logps/rejected": -5.596315383911133, "loss": 0.4204, "rewards/accuracies": 1.0, "rewards/chosen": 1.0081971883773804, "rewards/margins": 0.6491149663925171, "rewards/rejected": 0.3590822219848633, "step": 4235 }, { "epoch": 2.28, "learning_rate": 4.095461680473613e-08, "logits/chosen": -2.1591033935546875, "logits/rejected": -2.164964199066162, "logps/chosen": -2.6764326095581055, "logps/rejected": -1.9849048852920532, "loss": 0.6313, "rewards/accuracies": 1.0, "rewards/chosen": 0.9828214645385742, "rewards/margins": 0.12781602144241333, "rewards/rejected": 0.8550054430961609, "step": 4236 }, { "epoch": 2.29, "learning_rate": 4.093314014388043e-08, "logits/chosen": -2.1617507934570312, "logits/rejected": -2.3710217475891113, "logps/chosen": -6.054139614105225, "logps/rejected": -5.894509792327881, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.8697635531425476, "rewards/margins": -0.0011383295059204102, "rewards/rejected": 0.870901882648468, "step": 4237 }, { "epoch": 2.29, "learning_rate": 4.0911665212588275e-08, "logits/chosen": -2.186915636062622, "logits/rejected": -2.1779253482818604, "logps/chosen": -2.6432952880859375, "logps/rejected": -4.297490119934082, "loss": 0.384, "rewards/accuracies": 1.0, "rewards/chosen": 1.2020319700241089, "rewards/margins": 0.7590019702911377, "rewards/rejected": 0.4430299699306488, "step": 4238 }, { "epoch": 2.29, "learning_rate": 4.0890192014956146e-08, "logits/chosen": -2.006927251815796, "logits/rejected": -2.0095510482788086, "logps/chosen": -3.0534071922302246, "logps/rejected": -7.717296600341797, "loss": 0.4553, "rewards/accuracies": 1.0, "rewards/chosen": 0.9185255169868469, "rewards/margins": 0.5505087375640869, "rewards/rejected": 0.3680168092250824, "step": 4239 }, { "epoch": 2.29, "learning_rate": 4.086872055508019e-08, "logits/chosen": -2.1359238624572754, "logits/rejected": -2.282656192779541, "logps/chosen": -0.7374801635742188, "logps/rejected": -0.7558870315551758, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 1.089347243309021, "rewards/margins": 0.008486270904541016, "rewards/rejected": 1.08086097240448, "step": 4240 }, { "epoch": 2.29, "learning_rate": 4.084725083705624e-08, "logits/chosen": -2.029790163040161, "logits/rejected": -2.4069180488586426, "logps/chosen": -0.6543055176734924, "logps/rejected": -24.642148971557617, "loss": 0.8603, "rewards/accuracies": 0.0, "rewards/chosen": 1.036966323852539, "rewards/margins": -0.31027090549468994, "rewards/rejected": 1.347237229347229, "step": 4241 }, { "epoch": 2.29, "learning_rate": 4.082578286497976e-08, "logits/chosen": -2.0561294555664062, "logits/rejected": -2.0648815631866455, "logps/chosen": -2.7969400882720947, "logps/rejected": -10.710779190063477, "loss": 0.3455, "rewards/accuracies": 1.0, "rewards/chosen": 1.2907941341400146, "rewards/margins": 0.8849136829376221, "rewards/rejected": 0.4058804512023926, "step": 4242 }, { "epoch": 2.29, "learning_rate": 4.0804316642945945e-08, "logits/chosen": -2.0524044036865234, "logits/rejected": -2.172205924987793, "logps/chosen": -0.516486406326294, "logps/rejected": -0.5192522406578064, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.9725031852722168, "rewards/margins": 0.018885791301727295, "rewards/rejected": 0.9536173939704895, "step": 4243 }, { "epoch": 2.29, "learning_rate": 4.0782852175049603e-08, "logits/chosen": -2.257702350616455, "logits/rejected": -2.1079726219177246, "logps/chosen": -50.42976379394531, "logps/rejected": -21.48337173461914, "loss": 0.1477, "rewards/accuracies": 1.0, "rewards/chosen": 2.901371717453003, "rewards/margins": 1.8379695415496826, "rewards/rejected": 1.0634021759033203, "step": 4244 }, { "epoch": 2.29, "learning_rate": 4.076138946538522e-08, "logits/chosen": -2.2556984424591064, "logits/rejected": -2.2755379676818848, "logps/chosen": -12.439824104309082, "logps/rejected": -15.391806602478027, "loss": 0.4798, "rewards/accuracies": 1.0, "rewards/chosen": 1.820556640625, "rewards/margins": 0.4848548173904419, "rewards/rejected": 1.335701823234558, "step": 4245 }, { "epoch": 2.29, "learning_rate": 4.073992851804695e-08, "logits/chosen": -1.9981067180633545, "logits/rejected": -2.325568914413452, "logps/chosen": -0.47445210814476013, "logps/rejected": -0.48565593361854553, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 1.0470718145370483, "rewards/margins": 0.0405806303024292, "rewards/rejected": 1.0064911842346191, "step": 4246 }, { "epoch": 2.29, "learning_rate": 4.071846933712861e-08, "logits/chosen": -2.15238881111145, "logits/rejected": -2.308985948562622, "logps/chosen": -1.148818016052246, "logps/rejected": -0.9561935067176819, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.8924574851989746, "rewards/margins": 0.014899730682373047, "rewards/rejected": 0.8775577545166016, "step": 4247 }, { "epoch": 2.29, "learning_rate": 4.069701192672369e-08, "logits/chosen": -2.0362095832824707, "logits/rejected": -2.0380048751831055, "logps/chosen": -3.9229862689971924, "logps/rejected": -2.7578623294830322, "loss": 0.3016, "rewards/accuracies": 1.0, "rewards/chosen": 1.5925564765930176, "rewards/margins": 1.04402756690979, "rewards/rejected": 0.5485289692878723, "step": 4248 }, { "epoch": 2.29, "learning_rate": 4.0675556290925335e-08, "logits/chosen": -2.2697813510894775, "logits/rejected": -2.1649169921875, "logps/chosen": -25.06604766845703, "logps/rejected": -4.423977851867676, "loss": 0.1748, "rewards/accuracies": 1.0, "rewards/chosen": 2.0759849548339844, "rewards/margins": 1.6551352739334106, "rewards/rejected": 0.4208497107028961, "step": 4249 }, { "epoch": 2.29, "learning_rate": 4.0654102433826334e-08, "logits/chosen": -2.1190617084503174, "logits/rejected": -2.2966315746307373, "logps/chosen": -0.23729170858860016, "logps/rejected": -0.26732030510902405, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8561736941337585, "rewards/margins": 0.01098257303237915, "rewards/rejected": 0.8451911211013794, "step": 4250 }, { "epoch": 2.29, "learning_rate": 4.063265035951917e-08, "logits/chosen": -2.125131607055664, "logits/rejected": -2.0504837036132812, "logps/chosen": -19.457216262817383, "logps/rejected": -2.839460611343384, "loss": 0.3212, "rewards/accuracies": 1.0, "rewards/chosen": 1.5729891061782837, "rewards/margins": 0.9708148837089539, "rewards/rejected": 0.6021742224693298, "step": 4251 }, { "epoch": 2.29, "learning_rate": 4.061120007209595e-08, "logits/chosen": -2.114997625350952, "logits/rejected": -2.1218602657318115, "logps/chosen": -3.5980348587036133, "logps/rejected": -5.791287899017334, "loss": 0.55, "rewards/accuracies": 1.0, "rewards/chosen": 1.0029858350753784, "rewards/margins": 0.31030672788619995, "rewards/rejected": 0.6926791071891785, "step": 4252 }, { "epoch": 2.29, "learning_rate": 4.0589751575648466e-08, "logits/chosen": -2.1531670093536377, "logits/rejected": -2.3040289878845215, "logps/chosen": -6.490039348602295, "logps/rejected": -9.904534339904785, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 1.0193376541137695, "rewards/margins": -0.018121838569641113, "rewards/rejected": 1.0374594926834106, "step": 4253 }, { "epoch": 2.29, "learning_rate": 4.056830487426817e-08, "logits/chosen": -2.0669500827789307, "logits/rejected": -2.065512180328369, "logps/chosen": -0.6526361107826233, "logps/rejected": -7.666699409484863, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 1.1251413822174072, "rewards/margins": 0.7616633176803589, "rewards/rejected": 0.3634780943393707, "step": 4254 }, { "epoch": 2.3, "learning_rate": 4.0546859972046147e-08, "logits/chosen": -2.164982795715332, "logits/rejected": -2.3126049041748047, "logps/chosen": -1.5442761182785034, "logps/rejected": -1.4183799028396606, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.6354812979698181, "rewards/margins": -0.0008872151374816895, "rewards/rejected": 0.6363685131072998, "step": 4255 }, { "epoch": 2.3, "learning_rate": 4.0525416873073176e-08, "logits/chosen": -2.0537450313568115, "logits/rejected": -2.2849671840667725, "logps/chosen": -4.330192565917969, "logps/rejected": -4.228662490844727, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 1.081062912940979, "rewards/margins": 0.008983850479125977, "rewards/rejected": 1.072079062461853, "step": 4256 }, { "epoch": 2.3, "learning_rate": 4.0503975581439644e-08, "logits/chosen": -2.1068077087402344, "logits/rejected": -2.020540475845337, "logps/chosen": -36.47306442260742, "logps/rejected": -4.79440450668335, "loss": 0.1341, "rewards/accuracies": 1.0, "rewards/chosen": 2.196540594100952, "rewards/margins": 1.941413164138794, "rewards/rejected": 0.2551274299621582, "step": 4257 }, { "epoch": 2.3, "learning_rate": 4.048253610123561e-08, "logits/chosen": -2.1247048377990723, "logits/rejected": -2.1580588817596436, "logps/chosen": -2.39052677154541, "logps/rejected": -23.633258819580078, "loss": 0.5688, "rewards/accuracies": 1.0, "rewards/chosen": 1.1223104000091553, "rewards/margins": 0.2663869261741638, "rewards/rejected": 0.8559234738349915, "step": 4258 }, { "epoch": 2.3, "learning_rate": 4.0461098436550834e-08, "logits/chosen": -2.0651111602783203, "logits/rejected": -2.0723564624786377, "logps/chosen": -1.9055100679397583, "logps/rejected": -5.359443187713623, "loss": 0.4493, "rewards/accuracies": 1.0, "rewards/chosen": 1.0235975980758667, "rewards/margins": 0.5669121742248535, "rewards/rejected": 0.4566853940486908, "step": 4259 }, { "epoch": 2.3, "learning_rate": 4.043966259147468e-08, "logits/chosen": -2.1163570880889893, "logits/rejected": -2.3216607570648193, "logps/chosen": -2.2020864486694336, "logps/rejected": -1.0586574077606201, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 1.0077391862869263, "rewards/margins": 0.011877834796905518, "rewards/rejected": 0.9958613514900208, "step": 4260 }, { "epoch": 2.3, "learning_rate": 4.041822857009617e-08, "logits/chosen": -2.102633237838745, "logits/rejected": -2.111029863357544, "logps/chosen": -2.557345151901245, "logps/rejected": -7.094809055328369, "loss": 0.3819, "rewards/accuracies": 1.0, "rewards/chosen": 1.6085842847824097, "rewards/margins": 0.7655102610588074, "rewards/rejected": 0.8430740237236023, "step": 4261 }, { "epoch": 2.3, "learning_rate": 4.0396796376503997e-08, "logits/chosen": -2.0697171688079834, "logits/rejected": -2.061638593673706, "logps/chosen": -2.7534282207489014, "logps/rejected": -4.980573654174805, "loss": 0.4773, "rewards/accuracies": 1.0, "rewards/chosen": 1.0203226804733276, "rewards/margins": 0.4914877414703369, "rewards/rejected": 0.5288349390029907, "step": 4262 }, { "epoch": 2.3, "learning_rate": 4.037536601478648e-08, "logits/chosen": -2.148012161254883, "logits/rejected": -2.3381118774414062, "logps/chosen": -2.876094102859497, "logps/rejected": -4.943384170532227, "loss": 0.7329, "rewards/accuracies": 0.0, "rewards/chosen": 1.0993536710739136, "rewards/margins": -0.0778956413269043, "rewards/rejected": 1.1772493124008179, "step": 4263 }, { "epoch": 2.3, "learning_rate": 4.035393748903161e-08, "logits/chosen": -2.1255991458892822, "logits/rejected": -2.3306400775909424, "logps/chosen": -3.5660250186920166, "logps/rejected": -3.567047595977783, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.36812132596969604, "rewards/margins": 0.02469882369041443, "rewards/rejected": 0.3434225022792816, "step": 4264 }, { "epoch": 2.3, "learning_rate": 4.033251080332701e-08, "logits/chosen": -2.091914415359497, "logits/rejected": -2.091057777404785, "logps/chosen": -3.076744556427002, "logps/rejected": -12.703989028930664, "loss": 0.3396, "rewards/accuracies": 1.0, "rewards/chosen": 1.0933793783187866, "rewards/margins": 0.9054393768310547, "rewards/rejected": 0.18794003129005432, "step": 4265 }, { "epoch": 2.3, "learning_rate": 4.031108596175998e-08, "logits/chosen": -2.1010053157806396, "logits/rejected": -2.10201358795166, "logps/chosen": -1.7793396711349487, "logps/rejected": -6.505090236663818, "loss": 0.3469, "rewards/accuracies": 1.0, "rewards/chosen": 1.5238025188446045, "rewards/margins": 0.8803819417953491, "rewards/rejected": 0.6434205770492554, "step": 4266 }, { "epoch": 2.3, "learning_rate": 4.028966296841743e-08, "logits/chosen": -2.0472426414489746, "logits/rejected": -2.338704824447632, "logps/chosen": -1.397986650466919, "logps/rejected": -7.30102014541626, "loss": 0.5268, "rewards/accuracies": 1.0, "rewards/chosen": 1.262331247329712, "rewards/margins": 0.3659055829048157, "rewards/rejected": 0.8964256644248962, "step": 4267 }, { "epoch": 2.3, "learning_rate": 4.026824182738596e-08, "logits/chosen": -2.0089049339294434, "logits/rejected": -2.276198387145996, "logps/chosen": -0.7271683812141418, "logps/rejected": -0.7456152439117432, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 1.0231424570083618, "rewards/margins": -0.004456043243408203, "rewards/rejected": 1.02759850025177, "step": 4268 }, { "epoch": 2.3, "learning_rate": 4.024682254275176e-08, "logits/chosen": -2.0489065647125244, "logits/rejected": -2.056246757507324, "logps/chosen": -1.3138428926467896, "logps/rejected": -4.143194675445557, "loss": 0.4373, "rewards/accuracies": 1.0, "rewards/chosen": 1.049663782119751, "rewards/margins": 0.6006479263305664, "rewards/rejected": 0.44901585578918457, "step": 4269 }, { "epoch": 2.3, "learning_rate": 4.0225405118600744e-08, "logits/chosen": -2.027587652206421, "logits/rejected": -2.279301881790161, "logps/chosen": -0.23853154480457306, "logps/rejected": -0.2532356083393097, "loss": 0.6683, "rewards/accuracies": 1.0, "rewards/chosen": 1.0262196063995361, "rewards/margins": 0.05028080940246582, "rewards/rejected": 0.9759387969970703, "step": 4270 }, { "epoch": 2.3, "learning_rate": 4.020398955901839e-08, "logits/chosen": -2.2073299884796143, "logits/rejected": -2.3766579627990723, "logps/chosen": -14.861034393310547, "logps/rejected": -16.034496307373047, "loss": 0.5678, "rewards/accuracies": 1.0, "rewards/chosen": 1.293447494506836, "rewards/margins": 0.2686946392059326, "rewards/rejected": 1.0247528553009033, "step": 4271 }, { "epoch": 2.3, "learning_rate": 4.018257586808989e-08, "logits/chosen": -2.2298905849456787, "logits/rejected": -2.0699121952056885, "logps/chosen": -34.014556884765625, "logps/rejected": -4.410741329193115, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 2.903470277786255, "rewards/margins": 2.438184976577759, "rewards/rejected": 0.4652852714061737, "step": 4272 }, { "epoch": 2.3, "learning_rate": 4.0161164049900025e-08, "logits/chosen": -2.1462271213531494, "logits/rejected": -2.1376073360443115, "logps/chosen": -2.961860179901123, "logps/rejected": -5.127432823181152, "loss": 0.4866, "rewards/accuracies": 1.0, "rewards/chosen": 1.4057248830795288, "rewards/margins": 0.46716541051864624, "rewards/rejected": 0.9385594725608826, "step": 4273 }, { "epoch": 2.31, "learning_rate": 4.0139754108533245e-08, "logits/chosen": -1.9762811660766602, "logits/rejected": -1.9848170280456543, "logps/chosen": -1.8857386112213135, "logps/rejected": -4.330851078033447, "loss": 0.3913, "rewards/accuracies": 1.0, "rewards/chosen": 1.1755558252334595, "rewards/margins": 0.7362493276596069, "rewards/rejected": 0.43930649757385254, "step": 4274 }, { "epoch": 2.31, "learning_rate": 4.0118346048073646e-08, "logits/chosen": -1.9991393089294434, "logits/rejected": -2.0007436275482178, "logps/chosen": -0.9039818048477173, "logps/rejected": -6.073129653930664, "loss": 0.5211, "rewards/accuracies": 1.0, "rewards/chosen": 0.818598747253418, "rewards/margins": 0.3800621032714844, "rewards/rejected": 0.4385366439819336, "step": 4275 }, { "epoch": 2.31, "learning_rate": 4.0096939872604953e-08, "logits/chosen": -2.1396026611328125, "logits/rejected": -2.1539337635040283, "logps/chosen": -0.36874109506607056, "logps/rejected": -9.249096870422363, "loss": 0.4567, "rewards/accuracies": 1.0, "rewards/chosen": 0.976226270198822, "rewards/margins": 0.5466722249984741, "rewards/rejected": 0.4295540750026703, "step": 4276 }, { "epoch": 2.31, "learning_rate": 4.007553558621053e-08, "logits/chosen": -2.251678228378296, "logits/rejected": -2.3108713626861572, "logps/chosen": -7.971006870269775, "logps/rejected": -25.81521987915039, "loss": 0.4534, "rewards/accuracies": 1.0, "rewards/chosen": 1.1980581283569336, "rewards/margins": 0.5558317303657532, "rewards/rejected": 0.6422263979911804, "step": 4277 }, { "epoch": 2.31, "learning_rate": 4.00541331929734e-08, "logits/chosen": -2.1290228366851807, "logits/rejected": -2.061357259750366, "logps/chosen": -24.143714904785156, "logps/rejected": -2.2970199584960938, "loss": 0.1731, "rewards/accuracies": 1.0, "rewards/chosen": 2.450115919113159, "rewards/margins": 1.6662869453430176, "rewards/rejected": 0.7838289141654968, "step": 4278 }, { "epoch": 2.31, "learning_rate": 4.003273269697618e-08, "logits/chosen": -2.071145534515381, "logits/rejected": -2.2596237659454346, "logps/chosen": -0.24531009793281555, "logps/rejected": -0.3146585524082184, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.8590480089187622, "rewards/margins": 0.01490795612335205, "rewards/rejected": 0.8441400527954102, "step": 4279 }, { "epoch": 2.31, "learning_rate": 4.0011334102301186e-08, "logits/chosen": -2.140565872192383, "logits/rejected": -2.353534698486328, "logps/chosen": -0.5753520131111145, "logps/rejected": -0.5325836539268494, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.7481666207313538, "rewards/margins": 0.02699202299118042, "rewards/rejected": 0.7211745977401733, "step": 4280 }, { "epoch": 2.31, "learning_rate": 3.998993741303034e-08, "logits/chosen": -2.084948778152466, "logits/rejected": -2.0924627780914307, "logps/chosen": -4.910558700561523, "logps/rejected": -3.460503339767456, "loss": 0.4839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0647177696228027, "rewards/margins": 0.47414737939834595, "rewards/rejected": 0.5905703902244568, "step": 4281 }, { "epoch": 2.31, "learning_rate": 3.99685426332452e-08, "logits/chosen": -2.0657589435577393, "logits/rejected": -2.3164007663726807, "logps/chosen": -0.33808431029319763, "logps/rejected": -0.29159098863601685, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.8376376032829285, "rewards/margins": 0.02058577537536621, "rewards/rejected": 0.8170518279075623, "step": 4282 }, { "epoch": 2.31, "learning_rate": 3.994714976702695e-08, "logits/chosen": -2.066194534301758, "logits/rejected": -2.06362247467041, "logps/chosen": -1.9798409938812256, "logps/rejected": -6.37553071975708, "loss": 0.2514, "rewards/accuracies": 1.0, "rewards/chosen": 1.5558305978775024, "rewards/margins": 1.2525134086608887, "rewards/rejected": 0.30331721901893616, "step": 4283 }, { "epoch": 2.31, "learning_rate": 3.9925758818456436e-08, "logits/chosen": -2.081143856048584, "logits/rejected": -2.334465980529785, "logps/chosen": -12.810393333435059, "logps/rejected": -15.357336044311523, "loss": 0.5655, "rewards/accuracies": 1.0, "rewards/chosen": 1.0189417600631714, "rewards/margins": 0.27409827709198, "rewards/rejected": 0.7448434829711914, "step": 4284 }, { "epoch": 2.31, "learning_rate": 3.990436979161411e-08, "logits/chosen": -2.0465548038482666, "logits/rejected": -2.043684482574463, "logps/chosen": -1.3144304752349854, "logps/rejected": -5.749052047729492, "loss": 0.422, "rewards/accuracies": 1.0, "rewards/chosen": 0.9076420068740845, "rewards/margins": 0.6443983316421509, "rewards/rejected": 0.2632436752319336, "step": 4285 }, { "epoch": 2.31, "learning_rate": 3.988298269058009e-08, "logits/chosen": -2.2138445377349854, "logits/rejected": -2.215811014175415, "logps/chosen": -3.3706793785095215, "logps/rejected": -0.9624289870262146, "loss": 0.592, "rewards/accuracies": 1.0, "rewards/chosen": 1.1449823379516602, "rewards/margins": 0.21378690004348755, "rewards/rejected": 0.9311954379081726, "step": 4286 }, { "epoch": 2.31, "learning_rate": 3.986159751943408e-08, "logits/chosen": -2.0334696769714355, "logits/rejected": -2.3342268466949463, "logps/chosen": -0.7637091279029846, "logps/rejected": -0.6870211958885193, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 1.0224379301071167, "rewards/margins": -0.021839261054992676, "rewards/rejected": 1.0442771911621094, "step": 4287 }, { "epoch": 2.31, "learning_rate": 3.984021428225546e-08, "logits/chosen": -2.119239091873169, "logits/rejected": -2.1219213008880615, "logps/chosen": -3.1353983879089355, "logps/rejected": -4.4656453132629395, "loss": 0.4234, "rewards/accuracies": 1.0, "rewards/chosen": 1.0612154006958008, "rewards/margins": 0.6402839422225952, "rewards/rejected": 0.42093148827552795, "step": 4288 }, { "epoch": 2.31, "learning_rate": 3.981883298312321e-08, "logits/chosen": -2.0503482818603516, "logits/rejected": -2.2688496112823486, "logps/chosen": -3.3432159423828125, "logps/rejected": -3.039156675338745, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8172840476036072, "rewards/margins": 0.008665859699249268, "rewards/rejected": 0.8086181879043579, "step": 4289 }, { "epoch": 2.31, "learning_rate": 3.979745362611597e-08, "logits/chosen": -2.120553493499756, "logits/rejected": -2.191632032394409, "logps/chosen": -14.577783584594727, "logps/rejected": -11.82776165008545, "loss": 0.4826, "rewards/accuracies": 1.0, "rewards/chosen": 1.543320894241333, "rewards/margins": 0.4775996208190918, "rewards/rejected": 1.0657212734222412, "step": 4290 }, { "epoch": 2.31, "learning_rate": 3.977607621531197e-08, "logits/chosen": -2.031268835067749, "logits/rejected": -2.0250821113586426, "logps/chosen": -4.171450614929199, "logps/rejected": -4.03596305847168, "loss": 0.2407, "rewards/accuracies": 1.0, "rewards/chosen": 1.647237777709961, "rewards/margins": 1.3015385866165161, "rewards/rejected": 0.34569916129112244, "step": 4291 }, { "epoch": 2.31, "learning_rate": 3.975470075478915e-08, "logits/chosen": -2.028292655944824, "logits/rejected": -2.0376620292663574, "logps/chosen": -1.2354949712753296, "logps/rejected": -3.3509256839752197, "loss": 0.389, "rewards/accuracies": 1.0, "rewards/chosen": 1.311513066291809, "rewards/margins": 0.7434844374656677, "rewards/rejected": 0.5680286288261414, "step": 4292 }, { "epoch": 2.32, "learning_rate": 3.973332724862496e-08, "logits/chosen": -2.0780229568481445, "logits/rejected": -2.0775249004364014, "logps/chosen": -2.041858434677124, "logps/rejected": -4.80332088470459, "loss": 0.3057, "rewards/accuracies": 1.0, "rewards/chosen": 1.5064386129379272, "rewards/margins": 1.0285935401916504, "rewards/rejected": 0.4778450131416321, "step": 4293 }, { "epoch": 2.32, "learning_rate": 3.971195570089656e-08, "logits/chosen": -2.1423659324645996, "logits/rejected": -2.152705430984497, "logps/chosen": -1.5787527561187744, "logps/rejected": -2.853728771209717, "loss": 0.5119, "rewards/accuracies": 1.0, "rewards/chosen": 1.021452784538269, "rewards/margins": 0.4027353525161743, "rewards/rejected": 0.6187174320220947, "step": 4294 }, { "epoch": 2.32, "learning_rate": 3.969058611568072e-08, "logits/chosen": -2.2617874145507812, "logits/rejected": -2.1772983074188232, "logps/chosen": -22.693294525146484, "logps/rejected": -4.9170331954956055, "loss": 0.1714, "rewards/accuracies": 1.0, "rewards/chosen": 2.1407477855682373, "rewards/margins": 1.6771150827407837, "rewards/rejected": 0.4636326730251312, "step": 4295 }, { "epoch": 2.32, "learning_rate": 3.966921849705382e-08, "logits/chosen": -2.041167736053467, "logits/rejected": -2.0422792434692383, "logps/chosen": -0.5266883969306946, "logps/rejected": -5.44132137298584, "loss": 0.4864, "rewards/accuracies": 1.0, "rewards/chosen": 0.8983730673789978, "rewards/margins": 0.46770355105400085, "rewards/rejected": 0.43066951632499695, "step": 4296 }, { "epoch": 2.32, "learning_rate": 3.964785284909188e-08, "logits/chosen": -2.1158230304718018, "logits/rejected": -2.124798536300659, "logps/chosen": -1.6657989025115967, "logps/rejected": -2.8964180946350098, "loss": 0.4576, "rewards/accuracies": 1.0, "rewards/chosen": 1.216977834701538, "rewards/margins": 0.5442852973937988, "rewards/rejected": 0.6726925373077393, "step": 4297 }, { "epoch": 2.32, "learning_rate": 3.962648917587053e-08, "logits/chosen": -1.9805623292922974, "logits/rejected": -2.280029773712158, "logps/chosen": -0.21031741797924042, "logps/rejected": -0.24131181836128235, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.9623191952705383, "rewards/margins": 0.00694197416305542, "rewards/rejected": 0.9553772211074829, "step": 4298 }, { "epoch": 2.32, "learning_rate": 3.960512748146504e-08, "logits/chosen": -2.0484437942504883, "logits/rejected": -2.258035659790039, "logps/chosen": -0.23111703991889954, "logps/rejected": -0.24467948079109192, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 0.9095711708068848, "rewards/margins": 0.034712016582489014, "rewards/rejected": 0.8748591542243958, "step": 4299 }, { "epoch": 2.32, "learning_rate": 3.95837677699503e-08, "logits/chosen": -2.1784827709198, "logits/rejected": -2.323485851287842, "logps/chosen": -5.286755561828613, "logps/rejected": -1.2822816371917725, "loss": 0.75, "rewards/accuracies": 0.0, "rewards/chosen": 0.9552664756774902, "rewards/margins": -0.1106879711151123, "rewards/rejected": 1.0659544467926025, "step": 4300 }, { "epoch": 2.32, "learning_rate": 3.956241004540081e-08, "logits/chosen": -2.1234591007232666, "logits/rejected": -2.1388566493988037, "logps/chosen": -1.5155256986618042, "logps/rejected": -6.150949001312256, "loss": 0.313, "rewards/accuracies": 1.0, "rewards/chosen": 1.4243611097335815, "rewards/margins": 1.0011037588119507, "rewards/rejected": 0.42325732111930847, "step": 4301 }, { "epoch": 2.32, "learning_rate": 3.95410543118907e-08, "logits/chosen": -1.9927374124526978, "logits/rejected": -1.982276201248169, "logps/chosen": -8.970931053161621, "logps/rejected": -9.850141525268555, "loss": 0.502, "rewards/accuracies": 1.0, "rewards/chosen": 1.5529073476791382, "rewards/margins": 0.4278002977371216, "rewards/rejected": 1.1251070499420166, "step": 4302 }, { "epoch": 2.32, "learning_rate": 3.95197005734937e-08, "logits/chosen": -1.9366827011108398, "logits/rejected": -2.256732702255249, "logps/chosen": -3.751530170440674, "logps/rejected": -3.492504119873047, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.7353779673576355, "rewards/margins": -0.0028766989707946777, "rewards/rejected": 0.7382546663284302, "step": 4303 }, { "epoch": 2.32, "learning_rate": 3.94983488342832e-08, "logits/chosen": -2.0589826107025146, "logits/rejected": -2.257166624069214, "logps/chosen": -1.3865087032318115, "logps/rejected": -1.7283320426940918, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 1.0616366863250732, "rewards/margins": 0.0321732759475708, "rewards/rejected": 1.0294634103775024, "step": 4304 }, { "epoch": 2.32, "learning_rate": 3.9476999098332174e-08, "logits/chosen": -2.1609489917755127, "logits/rejected": -2.0498616695404053, "logps/chosen": -20.917083740234375, "logps/rejected": -4.2384138107299805, "loss": 0.1199, "rewards/accuracies": 1.0, "rewards/chosen": 2.560190200805664, "rewards/margins": 2.0603747367858887, "rewards/rejected": 0.4998153746128082, "step": 4305 }, { "epoch": 2.32, "learning_rate": 3.9455651369713236e-08, "logits/chosen": -2.1597201824188232, "logits/rejected": -2.042402744293213, "logps/chosen": -9.112138748168945, "logps/rejected": -2.2005133628845215, "loss": 0.3179, "rewards/accuracies": 1.0, "rewards/chosen": 1.7766163349151611, "rewards/margins": 0.9827790856361389, "rewards/rejected": 0.7938372492790222, "step": 4306 }, { "epoch": 2.32, "learning_rate": 3.94343056524986e-08, "logits/chosen": -2.215820074081421, "logits/rejected": -2.1037678718566895, "logps/chosen": -16.816139221191406, "logps/rejected": -2.385939836502075, "loss": 0.1852, "rewards/accuracies": 1.0, "rewards/chosen": 2.3119804859161377, "rewards/margins": 1.5922093391418457, "rewards/rejected": 0.7197712063789368, "step": 4307 }, { "epoch": 2.32, "learning_rate": 3.941296195076011e-08, "logits/chosen": -2.053662061691284, "logits/rejected": -2.0546658039093018, "logps/chosen": -2.2734193801879883, "logps/rejected": -0.666642427444458, "loss": 0.6321, "rewards/accuracies": 1.0, "rewards/chosen": 0.8571317791938782, "rewards/margins": 0.12613248825073242, "rewards/rejected": 0.7309992909431458, "step": 4308 }, { "epoch": 2.32, "learning_rate": 3.9391620268569204e-08, "logits/chosen": -2.127974271774292, "logits/rejected": -2.2981083393096924, "logps/chosen": -1.5394514799118042, "logps/rejected": -1.4926985502243042, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.9876835942268372, "rewards/margins": 0.028083324432373047, "rewards/rejected": 0.9596002697944641, "step": 4309 }, { "epoch": 2.32, "learning_rate": 3.9370280609996955e-08, "logits/chosen": -2.189833879470825, "logits/rejected": -2.152576446533203, "logps/chosen": -5.616009712219238, "logps/rejected": -16.074485778808594, "loss": 0.334, "rewards/accuracies": 1.0, "rewards/chosen": 1.2867705821990967, "rewards/margins": 0.9251036643981934, "rewards/rejected": 0.36166688799858093, "step": 4310 }, { "epoch": 2.33, "learning_rate": 3.934894297911404e-08, "logits/chosen": -2.1313998699188232, "logits/rejected": -2.1325795650482178, "logps/chosen": -2.8806331157684326, "logps/rejected": -7.986869812011719, "loss": 0.4621, "rewards/accuracies": 1.0, "rewards/chosen": 1.3326290845870972, "rewards/margins": 0.5320190787315369, "rewards/rejected": 0.8006100058555603, "step": 4311 }, { "epoch": 2.33, "learning_rate": 3.932760737999076e-08, "logits/chosen": -2.124929904937744, "logits/rejected": -2.237985372543335, "logps/chosen": -7.654948711395264, "logps/rejected": -20.815078735351562, "loss": 0.2376, "rewards/accuracies": 1.0, "rewards/chosen": 2.0234386920928955, "rewards/margins": 1.315901517868042, "rewards/rejected": 0.7075371146202087, "step": 4312 }, { "epoch": 2.33, "learning_rate": 3.9306273816697024e-08, "logits/chosen": -2.1546859741210938, "logits/rejected": -2.159059524536133, "logps/chosen": -3.8218183517456055, "logps/rejected": -10.320647239685059, "loss": 0.3213, "rewards/accuracies": 1.0, "rewards/chosen": 1.245964527130127, "rewards/margins": 0.9703012704849243, "rewards/rejected": 0.275663286447525, "step": 4313 }, { "epoch": 2.33, "learning_rate": 3.928494229330232e-08, "logits/chosen": -2.0593109130859375, "logits/rejected": -2.2966480255126953, "logps/chosen": -1.1787238121032715, "logps/rejected": -1.270451307296753, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.7893043160438538, "rewards/margins": 0.0385739803314209, "rewards/rejected": 0.7507303357124329, "step": 4314 }, { "epoch": 2.33, "learning_rate": 3.926361281387583e-08, "logits/chosen": -2.11071515083313, "logits/rejected": -2.326428174972534, "logps/chosen": -0.11328292638063431, "logps/rejected": -0.1216258853673935, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 1.0427299737930298, "rewards/margins": 0.017803430557250977, "rewards/rejected": 1.0249265432357788, "step": 4315 }, { "epoch": 2.33, "learning_rate": 3.9242285382486274e-08, "logits/chosen": -1.9800862073898315, "logits/rejected": -1.980352759361267, "logps/chosen": -0.8473417162895203, "logps/rejected": -2.2847368717193604, "loss": 0.5321, "rewards/accuracies": 1.0, "rewards/chosen": 1.045186161994934, "rewards/margins": 0.3530200123786926, "rewards/rejected": 0.6921661496162415, "step": 4316 }, { "epoch": 2.33, "learning_rate": 3.922096000320199e-08, "logits/chosen": -2.068866729736328, "logits/rejected": -2.141295909881592, "logps/chosen": -1.7607336044311523, "logps/rejected": -18.349773406982422, "loss": 0.5683, "rewards/accuracies": 1.0, "rewards/chosen": 1.403471827507019, "rewards/margins": 0.267549991607666, "rewards/rejected": 1.135921835899353, "step": 4317 }, { "epoch": 2.33, "learning_rate": 3.919963668009093e-08, "logits/chosen": -2.0534582138061523, "logits/rejected": -2.0453498363494873, "logps/chosen": -3.3053388595581055, "logps/rejected": -7.7992963790893555, "loss": 0.2549, "rewards/accuracies": 1.0, "rewards/chosen": 1.634172797203064, "rewards/margins": 1.2369272708892822, "rewards/rejected": 0.39724549651145935, "step": 4318 }, { "epoch": 2.33, "learning_rate": 3.9178315417220676e-08, "logits/chosen": -2.100790500640869, "logits/rejected": -2.105959892272949, "logps/chosen": -1.855911135673523, "logps/rejected": -12.944252014160156, "loss": 0.5114, "rewards/accuracies": 1.0, "rewards/chosen": 1.3547734022140503, "rewards/margins": 0.40411192178726196, "rewards/rejected": 0.9506614804267883, "step": 4319 }, { "epoch": 2.33, "learning_rate": 3.915699621865838e-08, "logits/chosen": -2.0901052951812744, "logits/rejected": -2.089317798614502, "logps/chosen": -0.15909118950366974, "logps/rejected": -6.896407127380371, "loss": 0.4888, "rewards/accuracies": 1.0, "rewards/chosen": 1.041563868522644, "rewards/margins": 0.46145254373550415, "rewards/rejected": 0.5801113247871399, "step": 4320 }, { "epoch": 2.33, "learning_rate": 3.9135679088470845e-08, "logits/chosen": -2.1494133472442627, "logits/rejected": -2.134355068206787, "logps/chosen": -7.587198734283447, "logps/rejected": -6.889497756958008, "loss": 0.321, "rewards/accuracies": 1.0, "rewards/chosen": 1.4818884134292603, "rewards/margins": 0.9716350436210632, "rewards/rejected": 0.510253369808197, "step": 4321 }, { "epoch": 2.33, "learning_rate": 3.911436403072444e-08, "logits/chosen": -2.0119693279266357, "logits/rejected": -2.3101205825805664, "logps/chosen": -0.40062350034713745, "logps/rejected": -0.42278820276260376, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9537240862846375, "rewards/margins": 0.012375712394714355, "rewards/rejected": 0.9413483738899231, "step": 4322 }, { "epoch": 2.33, "learning_rate": 3.909305104948515e-08, "logits/chosen": -1.986385703086853, "logits/rejected": -2.2798800468444824, "logps/chosen": -2.218717098236084, "logps/rejected": -2.290809154510498, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 1.0451011657714844, "rewards/margins": 0.01641666889190674, "rewards/rejected": 1.0286844968795776, "step": 4323 }, { "epoch": 2.33, "learning_rate": 3.907174014881858e-08, "logits/chosen": -2.0709595680236816, "logits/rejected": -2.078371286392212, "logps/chosen": -0.9444312453269958, "logps/rejected": -12.46799087524414, "loss": 0.5407, "rewards/accuracies": 1.0, "rewards/chosen": 1.1326977014541626, "rewards/margins": 0.33241409063339233, "rewards/rejected": 0.8002836108207703, "step": 4324 }, { "epoch": 2.33, "learning_rate": 3.90504313327899e-08, "logits/chosen": -2.155529022216797, "logits/rejected": -2.157257556915283, "logps/chosen": -6.966423034667969, "logps/rejected": -6.6957197189331055, "loss": 0.5792, "rewards/accuracies": 1.0, "rewards/chosen": 1.3739944696426392, "rewards/margins": 0.2425605058670044, "rewards/rejected": 1.1314339637756348, "step": 4325 }, { "epoch": 2.33, "learning_rate": 3.9029124605463946e-08, "logits/chosen": -2.046064853668213, "logits/rejected": -2.257725715637207, "logps/chosen": -2.3832521438598633, "logps/rejected": -0.5395166873931885, "loss": 0.7288, "rewards/accuracies": 0.0, "rewards/chosen": 1.0034434795379639, "rewards/margins": -0.07013189792633057, "rewards/rejected": 1.0735753774642944, "step": 4326 }, { "epoch": 2.33, "learning_rate": 3.900781997090509e-08, "logits/chosen": -2.190351963043213, "logits/rejected": -2.2787301540374756, "logps/chosen": -3.850938558578491, "logps/rejected": -5.633474826812744, "loss": 0.6684, "rewards/accuracies": 1.0, "rewards/chosen": 1.0096073150634766, "rewards/margins": 0.05017334222793579, "rewards/rejected": 0.9594339728355408, "step": 4327 }, { "epoch": 2.33, "learning_rate": 3.8986517433177346e-08, "logits/chosen": -1.973671793937683, "logits/rejected": -2.272203207015991, "logps/chosen": -2.1320204734802246, "logps/rejected": -1.732185959815979, "loss": 0.706, "rewards/accuracies": 0.0, "rewards/chosen": 0.6966613531112671, "rewards/margins": -0.025577902793884277, "rewards/rejected": 0.7222392559051514, "step": 4328 }, { "epoch": 2.33, "learning_rate": 3.89652169963443e-08, "logits/chosen": -2.0761284828186035, "logits/rejected": -2.2845818996429443, "logps/chosen": -3.9082224369049072, "logps/rejected": -3.0711469650268555, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.8724355101585388, "rewards/margins": 0.03920567035675049, "rewards/rejected": 0.8332298398017883, "step": 4329 }, { "epoch": 2.34, "learning_rate": 3.8943918664469165e-08, "logits/chosen": -2.034656524658203, "logits/rejected": -2.041771650314331, "logps/chosen": -1.9004278182983398, "logps/rejected": -4.934509754180908, "loss": 0.423, "rewards/accuracies": 1.0, "rewards/chosen": 1.119218349456787, "rewards/margins": 0.6413940787315369, "rewards/rejected": 0.47782427072525024, "step": 4330 }, { "epoch": 2.34, "learning_rate": 3.8922622441614725e-08, "logits/chosen": -2.1252338886260986, "logits/rejected": -2.2518484592437744, "logps/chosen": -0.5944727063179016, "logps/rejected": -0.6350532174110413, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 1.029839277267456, "rewards/margins": 0.00730586051940918, "rewards/rejected": 1.0225334167480469, "step": 4331 }, { "epoch": 2.34, "learning_rate": 3.890132833184339e-08, "logits/chosen": -2.042501211166382, "logits/rejected": -2.0979409217834473, "logps/chosen": -2.8792266845703125, "logps/rejected": -23.419031143188477, "loss": 0.3672, "rewards/accuracies": 1.0, "rewards/chosen": 1.098518967628479, "rewards/margins": 0.8125156164169312, "rewards/rejected": 0.28600332140922546, "step": 4332 }, { "epoch": 2.34, "learning_rate": 3.8880036339217154e-08, "logits/chosen": -2.0753331184387207, "logits/rejected": -2.289613723754883, "logps/chosen": -7.144402980804443, "logps/rejected": -8.974930763244629, "loss": 0.5951, "rewards/accuracies": 1.0, "rewards/chosen": 0.9724161028862, "rewards/margins": 0.20686215162277222, "rewards/rejected": 0.7655539512634277, "step": 4333 }, { "epoch": 2.34, "learning_rate": 3.8858746467797585e-08, "logits/chosen": -2.1067700386047363, "logits/rejected": -2.339226722717285, "logps/chosen": -6.0313215255737305, "logps/rejected": -0.24511629343032837, "loss": 0.573, "rewards/accuracies": 1.0, "rewards/chosen": 1.2472586631774902, "rewards/margins": 0.256639301776886, "rewards/rejected": 0.9906193614006042, "step": 4334 }, { "epoch": 2.34, "learning_rate": 3.883745872164588e-08, "logits/chosen": -2.019059658050537, "logits/rejected": -2.2860195636749268, "logps/chosen": -0.6831566691398621, "logps/rejected": -0.7884246706962585, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 1.0456503629684448, "rewards/margins": 0.009103894233703613, "rewards/rejected": 1.0365464687347412, "step": 4335 }, { "epoch": 2.34, "learning_rate": 3.881617310482279e-08, "logits/chosen": -2.0024404525756836, "logits/rejected": -2.289247512817383, "logps/chosen": -7.985792636871338, "logps/rejected": -5.983517646789551, "loss": 0.7385, "rewards/accuracies": 0.0, "rewards/chosen": 1.0422537326812744, "rewards/margins": -0.08874523639678955, "rewards/rejected": 1.130998969078064, "step": 4336 }, { "epoch": 2.34, "learning_rate": 3.879488962138874e-08, "logits/chosen": -2.060408353805542, "logits/rejected": -2.277193069458008, "logps/chosen": -0.10117974132299423, "logps/rejected": -0.07917177677154541, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.8665744662284851, "rewards/margins": 0.023795604705810547, "rewards/rejected": 0.8427788615226746, "step": 4337 }, { "epoch": 2.34, "learning_rate": 3.877360827540367e-08, "logits/chosen": -2.086507797241211, "logits/rejected": -2.086040735244751, "logps/chosen": -3.0093300342559814, "logps/rejected": -3.996995210647583, "loss": 0.5616, "rewards/accuracies": 1.0, "rewards/chosen": 0.8334302306175232, "rewards/margins": 0.2830166220664978, "rewards/rejected": 0.5504136085510254, "step": 4338 }, { "epoch": 2.34, "learning_rate": 3.875232907092715e-08, "logits/chosen": -2.1091675758361816, "logits/rejected": -2.300863027572632, "logps/chosen": -0.3456917405128479, "logps/rejected": -0.35779064893722534, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.9353480339050293, "rewards/margins": 0.024806559085845947, "rewards/rejected": 0.9105414748191833, "step": 4339 }, { "epoch": 2.34, "learning_rate": 3.8731052012018294e-08, "logits/chosen": -2.0095155239105225, "logits/rejected": -2.008737087249756, "logps/chosen": -0.5440642237663269, "logps/rejected": -1.97210693359375, "loss": 0.572, "rewards/accuracies": 1.0, "rewards/chosen": 1.0520418882369995, "rewards/margins": 0.2590402364730835, "rewards/rejected": 0.793001651763916, "step": 4340 }, { "epoch": 2.34, "learning_rate": 3.870977710273588e-08, "logits/chosen": -2.2070860862731934, "logits/rejected": -2.21577787399292, "logps/chosen": -1.275775671005249, "logps/rejected": -5.9425153732299805, "loss": 0.2852, "rewards/accuracies": 1.0, "rewards/chosen": 1.3917917013168335, "rewards/margins": 1.10856294631958, "rewards/rejected": 0.28322869539260864, "step": 4341 }, { "epoch": 2.34, "learning_rate": 3.8688504347138206e-08, "logits/chosen": -2.114535093307495, "logits/rejected": -2.2705018520355225, "logps/chosen": -0.14922887086868286, "logps/rejected": -0.14061179757118225, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.953205406665802, "rewards/margins": 0.0333782434463501, "rewards/rejected": 0.9198271632194519, "step": 4342 }, { "epoch": 2.34, "learning_rate": 3.866723374928321e-08, "logits/chosen": -2.1540768146514893, "logits/rejected": -2.313826322555542, "logps/chosen": -0.2373654842376709, "logps/rejected": -6.8048930168151855, "loss": 0.5931, "rewards/accuracies": 1.0, "rewards/chosen": 0.9525278210639954, "rewards/margins": 0.21125131845474243, "rewards/rejected": 0.7412765026092529, "step": 4343 }, { "epoch": 2.34, "learning_rate": 3.86459653132284e-08, "logits/chosen": -2.3465781211853027, "logits/rejected": -2.2226409912109375, "logps/chosen": -28.603702545166016, "logps/rejected": -2.8869171142578125, "loss": 0.1946, "rewards/accuracies": 1.0, "rewards/chosen": 2.2927067279815674, "rewards/margins": 1.5380451679229736, "rewards/rejected": 0.754661500453949, "step": 4344 }, { "epoch": 2.34, "learning_rate": 3.862469904303086e-08, "logits/chosen": -2.127030372619629, "logits/rejected": -2.2829291820526123, "logps/chosen": -0.4075489640235901, "logps/rejected": -0.3797681927680969, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 1.0098024606704712, "rewards/margins": 0.02380305528640747, "rewards/rejected": 0.9859994053840637, "step": 4345 }, { "epoch": 2.34, "learning_rate": 3.8603434942747284e-08, "logits/chosen": -2.1501996517181396, "logits/rejected": -2.3965256214141846, "logps/chosen": -9.053789138793945, "logps/rejected": -12.72573184967041, "loss": 0.7224, "rewards/accuracies": 0.0, "rewards/chosen": 1.0519647598266602, "rewards/margins": -0.05764174461364746, "rewards/rejected": 1.1096065044403076, "step": 4346 }, { "epoch": 2.34, "learning_rate": 3.858217301643393e-08, "logits/chosen": -2.0235581398010254, "logits/rejected": -2.028822183609009, "logps/chosen": -1.5177299976348877, "logps/rejected": -4.341804027557373, "loss": 0.4604, "rewards/accuracies": 1.0, "rewards/chosen": 0.9914858937263489, "rewards/margins": 0.5366564989089966, "rewards/rejected": 0.4548293650150299, "step": 4347 }, { "epoch": 2.35, "learning_rate": 3.8560913268146655e-08, "logits/chosen": -2.0847113132476807, "logits/rejected": -2.282116413116455, "logps/chosen": -2.8016669750213623, "logps/rejected": -5.92702054977417, "loss": 0.6451, "rewards/accuracies": 1.0, "rewards/chosen": 0.608191192150116, "rewards/margins": 0.09852564334869385, "rewards/rejected": 0.5096655488014221, "step": 4348 }, { "epoch": 2.35, "learning_rate": 3.85396557019409e-08, "logits/chosen": -2.1002869606018066, "logits/rejected": -2.2303996086120605, "logps/chosen": -0.7226964831352234, "logps/rejected": -0.8241429924964905, "loss": 0.6697, "rewards/accuracies": 1.0, "rewards/chosen": 0.7605385780334473, "rewards/margins": 0.047443270683288574, "rewards/rejected": 0.7130953073501587, "step": 4349 }, { "epoch": 2.35, "learning_rate": 3.851840032187168e-08, "logits/chosen": -2.118191719055176, "logits/rejected": -2.336937665939331, "logps/chosen": -2.113285541534424, "logps/rejected": -6.2432942390441895, "loss": 0.6489, "rewards/accuracies": 1.0, "rewards/chosen": 1.0188645124435425, "rewards/margins": 0.0904662013053894, "rewards/rejected": 0.9283983111381531, "step": 4350 }, { "epoch": 2.35, "learning_rate": 3.8497147131993615e-08, "logits/chosen": -2.0868842601776123, "logits/rejected": -2.3301608562469482, "logps/chosen": -0.226321280002594, "logps/rejected": -0.2804463803768158, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9098753929138184, "rewards/margins": 0.01777416467666626, "rewards/rejected": 0.8921012282371521, "step": 4351 }, { "epoch": 2.35, "learning_rate": 3.847589613636087e-08, "logits/chosen": -2.102022409439087, "logits/rejected": -2.366028308868408, "logps/chosen": -1.1063857078552246, "logps/rejected": -1.1897778511047363, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.7971544861793518, "rewards/margins": 0.01490795612335205, "rewards/rejected": 0.7822465300559998, "step": 4352 }, { "epoch": 2.35, "learning_rate": 3.8454647339027226e-08, "logits/chosen": -2.023775339126587, "logits/rejected": -2.0335094928741455, "logps/chosen": -1.6771037578582764, "logps/rejected": -2.665212392807007, "loss": 0.3902, "rewards/accuracies": 1.0, "rewards/chosen": 1.4453110694885254, "rewards/margins": 0.7397871017456055, "rewards/rejected": 0.7055239677429199, "step": 4353 }, { "epoch": 2.35, "learning_rate": 3.843340074404603e-08, "logits/chosen": -2.06569766998291, "logits/rejected": -2.057328939437866, "logps/chosen": -3.866102695465088, "logps/rejected": -7.253261566162109, "loss": 0.299, "rewards/accuracies": 1.0, "rewards/chosen": 1.421921968460083, "rewards/margins": 1.053918480873108, "rewards/rejected": 0.3680034577846527, "step": 4354 }, { "epoch": 2.35, "learning_rate": 3.8412156355470204e-08, "logits/chosen": -1.9494240283966064, "logits/rejected": -2.2819430828094482, "logps/chosen": -0.5934895873069763, "logps/rejected": -0.5513795018196106, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.8754990696907043, "rewards/margins": 0.013351798057556152, "rewards/rejected": 0.8621472716331482, "step": 4355 }, { "epoch": 2.35, "learning_rate": 3.8390914177352276e-08, "logits/chosen": -2.045994520187378, "logits/rejected": -2.27126145362854, "logps/chosen": -1.0381755828857422, "logps/rejected": -0.9842715263366699, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.9052998423576355, "rewards/margins": 0.02805924415588379, "rewards/rejected": 0.8772405982017517, "step": 4356 }, { "epoch": 2.35, "learning_rate": 3.8369674213744305e-08, "logits/chosen": -2.0689213275909424, "logits/rejected": -2.0716981887817383, "logps/chosen": -4.217079162597656, "logps/rejected": -1.7497367858886719, "loss": 0.4702, "rewards/accuracies": 1.0, "rewards/chosen": 1.4399051666259766, "rewards/margins": 0.5103623270988464, "rewards/rejected": 0.9295428395271301, "step": 4357 }, { "epoch": 2.35, "learning_rate": 3.8348436468697966e-08, "logits/chosen": -2.0270614624023438, "logits/rejected": -2.0422847270965576, "logps/chosen": -3.3693604469299316, "logps/rejected": -8.210733413696289, "loss": 0.4438, "rewards/accuracies": 1.0, "rewards/chosen": 1.1156386137008667, "rewards/margins": 0.5821808576583862, "rewards/rejected": 0.5334577560424805, "step": 4358 }, { "epoch": 2.35, "learning_rate": 3.832720094626448e-08, "logits/chosen": -2.1759424209594727, "logits/rejected": -2.180400848388672, "logps/chosen": -2.661555767059326, "logps/rejected": -4.548896789550781, "loss": 0.3849, "rewards/accuracies": 1.0, "rewards/chosen": 1.226294755935669, "rewards/margins": 0.75628662109375, "rewards/rejected": 0.47000810503959656, "step": 4359 }, { "epoch": 2.35, "learning_rate": 3.8305967650494706e-08, "logits/chosen": -1.9560712575912476, "logits/rejected": -2.300870418548584, "logps/chosen": -0.16660483181476593, "logps/rejected": -0.2285761535167694, "loss": 0.7066, "rewards/accuracies": 0.0, "rewards/chosen": 0.9316112399101257, "rewards/margins": -0.026789963245391846, "rewards/rejected": 0.9584012031555176, "step": 4360 }, { "epoch": 2.35, "learning_rate": 3.8284736585439014e-08, "logits/chosen": -2.160135507583618, "logits/rejected": -2.022888422012329, "logps/chosen": -29.556081771850586, "logps/rejected": -2.2824811935424805, "loss": 0.1279, "rewards/accuracies": 1.0, "rewards/chosen": 2.679213523864746, "rewards/margins": 1.9915292263031006, "rewards/rejected": 0.6876843571662903, "step": 4361 }, { "epoch": 2.35, "learning_rate": 3.826350775514738e-08, "logits/chosen": -2.0964534282684326, "logits/rejected": -2.288480520248413, "logps/chosen": -2.181065559387207, "logps/rejected": -6.774929046630859, "loss": 0.5695, "rewards/accuracies": 1.0, "rewards/chosen": 0.7283956408500671, "rewards/margins": 0.26481255888938904, "rewards/rejected": 0.4635830819606781, "step": 4362 }, { "epoch": 2.35, "learning_rate": 3.8242281163669324e-08, "logits/chosen": -2.067387342453003, "logits/rejected": -2.0807814598083496, "logps/chosen": -2.0171260833740234, "logps/rejected": -2.758373975753784, "loss": 0.5024, "rewards/accuracies": 1.0, "rewards/chosen": 1.0697567462921143, "rewards/margins": 0.42678505182266235, "rewards/rejected": 0.6429716944694519, "step": 4363 }, { "epoch": 2.35, "learning_rate": 3.8221056815053966e-08, "logits/chosen": -2.1495096683502197, "logits/rejected": -2.149428129196167, "logps/chosen": -2.562103748321533, "logps/rejected": -2.3253366947174072, "loss": 0.6257, "rewards/accuracies": 1.0, "rewards/chosen": 0.9536176919937134, "rewards/margins": 0.13975566625595093, "rewards/rejected": 0.8138620257377625, "step": 4364 }, { "epoch": 2.35, "learning_rate": 3.819983471334999e-08, "logits/chosen": -2.103137254714966, "logits/rejected": -2.1060590744018555, "logps/chosen": -1.0600996017456055, "logps/rejected": -2.040201425552368, "loss": 0.5875, "rewards/accuracies": 1.0, "rewards/chosen": 1.1730917692184448, "rewards/margins": 0.22378146648406982, "rewards/rejected": 0.949310302734375, "step": 4365 }, { "epoch": 2.35, "learning_rate": 3.8178614862605665e-08, "logits/chosen": -2.021768093109131, "logits/rejected": -2.2974820137023926, "logps/chosen": -0.21371060609817505, "logps/rejected": -0.2036316990852356, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9920110106468201, "rewards/margins": 0.019203782081604004, "rewards/rejected": 0.9728072285652161, "step": 4366 }, { "epoch": 2.36, "learning_rate": 3.8157397266868794e-08, "logits/chosen": -2.082077980041504, "logits/rejected": -2.0816938877105713, "logps/chosen": -1.5916707515716553, "logps/rejected": -4.310086250305176, "loss": 0.5447, "rewards/accuracies": 1.0, "rewards/chosen": 0.7979281544685364, "rewards/margins": 0.32278212904930115, "rewards/rejected": 0.47514602541923523, "step": 4367 }, { "epoch": 2.36, "learning_rate": 3.8136181930186807e-08, "logits/chosen": -2.1870570182800293, "logits/rejected": -2.1929454803466797, "logps/chosen": -2.83200740814209, "logps/rejected": -6.55056619644165, "loss": 0.3397, "rewards/accuracies": 1.0, "rewards/chosen": 1.7572895288467407, "rewards/margins": 0.9051510095596313, "rewards/rejected": 0.8521385192871094, "step": 4368 }, { "epoch": 2.36, "learning_rate": 3.811496885660664e-08, "logits/chosen": -2.066908597946167, "logits/rejected": -2.0784201622009277, "logps/chosen": -5.44765043258667, "logps/rejected": -9.489461898803711, "loss": 0.2769, "rewards/accuracies": 1.0, "rewards/chosen": 1.7019226551055908, "rewards/margins": 1.142557144165039, "rewards/rejected": 0.559365451335907, "step": 4369 }, { "epoch": 2.36, "learning_rate": 3.809375805017483e-08, "logits/chosen": -2.182420492172241, "logits/rejected": -2.1747491359710693, "logps/chosen": -6.750072002410889, "logps/rejected": -2.7509331703186035, "loss": 0.4656, "rewards/accuracies": 1.0, "rewards/chosen": 1.2636699676513672, "rewards/margins": 0.5224989056587219, "rewards/rejected": 0.7411710619926453, "step": 4370 }, { "epoch": 2.36, "learning_rate": 3.80725495149375e-08, "logits/chosen": -2.077413320541382, "logits/rejected": -2.076625347137451, "logps/chosen": -1.0249110460281372, "logps/rejected": -3.3596765995025635, "loss": 0.5779, "rewards/accuracies": 1.0, "rewards/chosen": 1.0281646251678467, "rewards/margins": 0.24552595615386963, "rewards/rejected": 0.782638669013977, "step": 4371 }, { "epoch": 2.36, "learning_rate": 3.80513432549403e-08, "logits/chosen": -2.1357932090759277, "logits/rejected": -2.2902941703796387, "logps/chosen": -1.332517385482788, "logps/rejected": -3.2502896785736084, "loss": 0.589, "rewards/accuracies": 1.0, "rewards/chosen": 0.9222745895385742, "rewards/margins": 0.22043877840042114, "rewards/rejected": 0.7018358111381531, "step": 4372 }, { "epoch": 2.36, "learning_rate": 3.803013927422848e-08, "logits/chosen": -2.0537023544311523, "logits/rejected": -2.068284273147583, "logps/chosen": -1.3496402502059937, "logps/rejected": -6.972500801086426, "loss": 0.4372, "rewards/accuracies": 1.0, "rewards/chosen": 1.2808220386505127, "rewards/margins": 0.6008646488189697, "rewards/rejected": 0.679957389831543, "step": 4373 }, { "epoch": 2.36, "learning_rate": 3.8008937576846824e-08, "logits/chosen": -2.050286054611206, "logits/rejected": -2.280734062194824, "logps/chosen": -0.7616955041885376, "logps/rejected": -0.7465023398399353, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 1.0410470962524414, "rewards/margins": 0.016375184059143066, "rewards/rejected": 1.0246719121932983, "step": 4374 }, { "epoch": 2.36, "learning_rate": 3.79877381668397e-08, "logits/chosen": -2.025057792663574, "logits/rejected": -2.2065062522888184, "logps/chosen": -0.9378515481948853, "logps/rejected": -1.0117484331130981, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8690779805183411, "rewards/margins": 0.00969630479812622, "rewards/rejected": 0.8593816757202148, "step": 4375 }, { "epoch": 2.36, "learning_rate": 3.796654104825104e-08, "logits/chosen": -2.0148212909698486, "logits/rejected": -2.0121407508850098, "logps/chosen": -6.847288131713867, "logps/rejected": -4.497450351715088, "loss": 0.2921, "rewards/accuracies": 1.0, "rewards/chosen": 1.4855049848556519, "rewards/margins": 1.0809149742126465, "rewards/rejected": 0.404589980840683, "step": 4376 }, { "epoch": 2.36, "learning_rate": 3.794534622512434e-08, "logits/chosen": -2.1592323780059814, "logits/rejected": -2.150348424911499, "logps/chosen": -3.351996421813965, "logps/rejected": -9.154083251953125, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 1.3117998838424683, "rewards/margins": 0.8763099908828735, "rewards/rejected": 0.43548986315727234, "step": 4377 }, { "epoch": 2.36, "learning_rate": 3.792415370150264e-08, "logits/chosen": -1.9909443855285645, "logits/rejected": -2.251152753829956, "logps/chosen": -0.5357949137687683, "logps/rejected": -3.4668679237365723, "loss": 0.5313, "rewards/accuracies": 1.0, "rewards/chosen": 1.0195897817611694, "rewards/margins": 0.35516083240509033, "rewards/rejected": 0.6644289493560791, "step": 4378 }, { "epoch": 2.36, "learning_rate": 3.790296348142856e-08, "logits/chosen": -2.1484711170196533, "logits/rejected": -2.2289254665374756, "logps/chosen": -1.8247021436691284, "logps/rejected": -1.6474273204803467, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.9707979559898376, "rewards/margins": -0.006418585777282715, "rewards/rejected": 0.9772165417671204, "step": 4379 }, { "epoch": 2.36, "learning_rate": 3.788177556894429e-08, "logits/chosen": -2.196890115737915, "logits/rejected": -2.079206943511963, "logps/chosen": -35.17137908935547, "logps/rejected": -2.103663444519043, "loss": 0.1383, "rewards/accuracies": 1.0, "rewards/chosen": 2.666262149810791, "rewards/margins": 1.9086501598358154, "rewards/rejected": 0.7576119303703308, "step": 4380 }, { "epoch": 2.36, "learning_rate": 3.786058996809151e-08, "logits/chosen": -2.0793209075927734, "logits/rejected": -2.2962660789489746, "logps/chosen": -0.8478133082389832, "logps/rejected": -1.3622536659240723, "loss": 0.7222, "rewards/accuracies": 0.0, "rewards/chosen": 0.8792751431465149, "rewards/margins": -0.057266056537628174, "rewards/rejected": 0.9365411996841431, "step": 4381 }, { "epoch": 2.36, "learning_rate": 3.7839406682911575e-08, "logits/chosen": -2.025040864944458, "logits/rejected": -1.995574951171875, "logps/chosen": -7.9082818031311035, "logps/rejected": -5.716050148010254, "loss": 0.3882, "rewards/accuracies": 1.0, "rewards/chosen": 1.4689377546310425, "rewards/margins": 0.7457188963890076, "rewards/rejected": 0.7232188582420349, "step": 4382 }, { "epoch": 2.36, "learning_rate": 3.7818225717445314e-08, "logits/chosen": -2.2038626670837402, "logits/rejected": -2.2150282859802246, "logps/chosen": -2.692843198776245, "logps/rejected": -7.1593828201293945, "loss": 0.2657, "rewards/accuracies": 1.0, "rewards/chosen": 1.3463279008865356, "rewards/margins": 1.1894476413726807, "rewards/rejected": 0.15688028931617737, "step": 4383 }, { "epoch": 2.36, "learning_rate": 3.779704707573313e-08, "logits/chosen": -2.0973000526428223, "logits/rejected": -2.325402021408081, "logps/chosen": -0.07826545089483261, "logps/rejected": -0.08759067207574844, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.9385207295417786, "rewards/margins": 0.008831381797790527, "rewards/rejected": 0.929689347743988, "step": 4384 }, { "epoch": 2.37, "learning_rate": 3.777587076181501e-08, "logits/chosen": -2.0475013256073, "logits/rejected": -2.0474135875701904, "logps/chosen": -0.47857561707496643, "logps/rejected": -7.86117696762085, "loss": 0.3505, "rewards/accuracies": 1.0, "rewards/chosen": 1.1366932392120361, "rewards/margins": 0.8680545687675476, "rewards/rejected": 0.2686386704444885, "step": 4385 }, { "epoch": 2.37, "learning_rate": 3.775469677973047e-08, "logits/chosen": -1.9729877710342407, "logits/rejected": -1.968640923500061, "logps/chosen": -3.684357166290283, "logps/rejected": -5.989323616027832, "loss": 0.2472, "rewards/accuracies": 1.0, "rewards/chosen": 1.7109965085983276, "rewards/margins": 1.2713823318481445, "rewards/rejected": 0.43961411714553833, "step": 4386 }, { "epoch": 2.37, "learning_rate": 3.773352513351856e-08, "logits/chosen": -2.1482114791870117, "logits/rejected": -2.146322250366211, "logps/chosen": -3.334641933441162, "logps/rejected": -2.2709786891937256, "loss": 0.4992, "rewards/accuracies": 1.0, "rewards/chosen": 1.1559520959854126, "rewards/margins": 0.43480461835861206, "rewards/rejected": 0.7211474776268005, "step": 4387 }, { "epoch": 2.37, "learning_rate": 3.771235582721794e-08, "logits/chosen": -2.0011520385742188, "logits/rejected": -2.007922410964966, "logps/chosen": -1.5887616872787476, "logps/rejected": -4.85627555847168, "loss": 0.4341, "rewards/accuracies": 1.0, "rewards/chosen": 0.9944775700569153, "rewards/margins": 0.6096925735473633, "rewards/rejected": 0.384784996509552, "step": 4388 }, { "epoch": 2.37, "learning_rate": 3.7691188864866786e-08, "logits/chosen": -2.244687557220459, "logits/rejected": -2.2776713371276855, "logps/chosen": -8.832557678222656, "logps/rejected": -7.013331890106201, "loss": 0.6441, "rewards/accuracies": 1.0, "rewards/chosen": 1.0316158533096313, "rewards/margins": 0.10061609745025635, "rewards/rejected": 0.930999755859375, "step": 4389 }, { "epoch": 2.37, "learning_rate": 3.767002425050283e-08, "logits/chosen": -2.1196131706237793, "logits/rejected": -2.3186185359954834, "logps/chosen": -0.4489874541759491, "logps/rejected": -0.5012356638908386, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.9221726655960083, "rewards/margins": 0.0016753077507019043, "rewards/rejected": 0.9204973578453064, "step": 4390 }, { "epoch": 2.37, "learning_rate": 3.7648861988163374e-08, "logits/chosen": -2.0325682163238525, "logits/rejected": -2.2943918704986572, "logps/chosen": -0.5248889327049255, "logps/rejected": -0.4173341989517212, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.9381475448608398, "rewards/margins": 0.024997234344482422, "rewards/rejected": 0.9131503105163574, "step": 4391 }, { "epoch": 2.37, "learning_rate": 3.762770208188524e-08, "logits/chosen": -2.0459353923797607, "logits/rejected": -2.189422845840454, "logps/chosen": -0.9871655106544495, "logps/rejected": -0.9753504395484924, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.97506183385849, "rewards/margins": 0.018534958362579346, "rewards/rejected": 0.9565268754959106, "step": 4392 }, { "epoch": 2.37, "learning_rate": 3.7606544535704843e-08, "logits/chosen": -2.147052526473999, "logits/rejected": -2.1483726501464844, "logps/chosen": -2.9398014545440674, "logps/rejected": -4.066205024719238, "loss": 0.5476, "rewards/accuracies": 1.0, "rewards/chosen": 1.0575940608978271, "rewards/margins": 0.3158462643623352, "rewards/rejected": 0.7417477965354919, "step": 4393 }, { "epoch": 2.37, "learning_rate": 3.758538935365812e-08, "logits/chosen": -1.9787646532058716, "logits/rejected": -2.315898895263672, "logps/chosen": -3.0072741508483887, "logps/rejected": -2.7705740928649902, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 1.1161450147628784, "rewards/margins": 0.0066852569580078125, "rewards/rejected": 1.1094597578048706, "step": 4394 }, { "epoch": 2.37, "learning_rate": 3.756423653978056e-08, "logits/chosen": -2.193288803100586, "logits/rejected": -2.196192979812622, "logps/chosen": -0.24211488664150238, "logps/rejected": -5.424014568328857, "loss": 0.4068, "rewards/accuracies": 1.0, "rewards/chosen": 0.9462835192680359, "rewards/margins": 0.6891014575958252, "rewards/rejected": 0.2571820914745331, "step": 4395 }, { "epoch": 2.37, "learning_rate": 3.75430860981072e-08, "logits/chosen": -2.1220808029174805, "logits/rejected": -2.265118360519409, "logps/chosen": -4.467276573181152, "logps/rejected": -3.503162384033203, "loss": 0.5848, "rewards/accuracies": 1.0, "rewards/chosen": 1.128334879875183, "rewards/margins": 0.22990697622299194, "rewards/rejected": 0.8984279036521912, "step": 4396 }, { "epoch": 2.37, "learning_rate": 3.752193803267263e-08, "logits/chosen": -2.1722605228424072, "logits/rejected": -2.151625871658325, "logps/chosen": -12.190193176269531, "logps/rejected": -1.3451365232467651, "loss": 0.3971, "rewards/accuracies": 1.0, "rewards/chosen": 1.677864909172058, "rewards/margins": 0.7183799147605896, "rewards/rejected": 0.9594849944114685, "step": 4397 }, { "epoch": 2.37, "learning_rate": 3.7500792347510976e-08, "logits/chosen": -2.128502130508423, "logits/rejected": -2.2995870113372803, "logps/chosen": -3.8107094764709473, "logps/rejected": -3.3934292793273926, "loss": 0.7097, "rewards/accuracies": 0.0, "rewards/chosen": 1.0090621709823608, "rewards/margins": -0.032770633697509766, "rewards/rejected": 1.0418328046798706, "step": 4398 }, { "epoch": 2.37, "learning_rate": 3.7479649046655924e-08, "logits/chosen": -2.1784865856170654, "logits/rejected": -2.1422340869903564, "logps/chosen": -24.296812057495117, "logps/rejected": -11.875503540039062, "loss": 0.3182, "rewards/accuracies": 1.0, "rewards/chosen": 2.063279151916504, "rewards/margins": 0.9817641973495483, "rewards/rejected": 1.0815149545669556, "step": 4399 }, { "epoch": 2.37, "learning_rate": 3.74585081341407e-08, "logits/chosen": -2.1109538078308105, "logits/rejected": -2.3443832397460938, "logps/chosen": -1.0958850383758545, "logps/rejected": -1.0265371799468994, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 1.0951894521713257, "rewards/margins": -0.008115530014038086, "rewards/rejected": 1.1033049821853638, "step": 4400 }, { "epoch": 2.37, "learning_rate": 3.743736961399807e-08, "logits/chosen": -2.07399845123291, "logits/rejected": -2.074666738510132, "logps/chosen": -1.4700911045074463, "logps/rejected": -1.4891083240509033, "loss": 0.4945, "rewards/accuracies": 1.0, "rewards/chosen": 1.3362891674041748, "rewards/margins": 0.446711003780365, "rewards/rejected": 0.8895781636238098, "step": 4401 }, { "epoch": 2.37, "learning_rate": 3.741623349026035e-08, "logits/chosen": -2.0360565185546875, "logits/rejected": -2.028420925140381, "logps/chosen": -26.19416046142578, "logps/rejected": -8.58271598815918, "loss": 0.1746, "rewards/accuracies": 1.0, "rewards/chosen": 2.196420669555664, "rewards/margins": 1.656720519065857, "rewards/rejected": 0.5397001504898071, "step": 4402 }, { "epoch": 2.37, "learning_rate": 3.7395099766959396e-08, "logits/chosen": -2.1618387699127197, "logits/rejected": -2.286184310913086, "logps/chosen": -2.0218589305877686, "logps/rejected": -0.651537299156189, "loss": 0.6462, "rewards/accuracies": 1.0, "rewards/chosen": 0.9807637333869934, "rewards/margins": 0.09616553783416748, "rewards/rejected": 0.8845981955528259, "step": 4403 }, { "epoch": 2.38, "learning_rate": 3.737396844812657e-08, "logits/chosen": -2.0063247680664062, "logits/rejected": -2.2638683319091797, "logps/chosen": -0.6692224144935608, "logps/rejected": -0.6297353506088257, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.9028399586677551, "rewards/margins": -0.011492133140563965, "rewards/rejected": 0.9143320918083191, "step": 4404 }, { "epoch": 2.38, "learning_rate": 3.735283953779287e-08, "logits/chosen": -2.199273109436035, "logits/rejected": -2.1612000465393066, "logps/chosen": -16.47323989868164, "logps/rejected": -3.669853687286377, "loss": 0.2434, "rewards/accuracies": 1.0, "rewards/chosen": 1.8061786890029907, "rewards/margins": 1.2890942096710205, "rewards/rejected": 0.5170844793319702, "step": 4405 }, { "epoch": 2.38, "learning_rate": 3.733171303998874e-08, "logits/chosen": -2.0132272243499756, "logits/rejected": -2.014317750930786, "logps/chosen": -0.7034325003623962, "logps/rejected": -7.384042263031006, "loss": 0.4346, "rewards/accuracies": 1.0, "rewards/chosen": 1.0993314981460571, "rewards/margins": 0.6081298589706421, "rewards/rejected": 0.49120163917541504, "step": 4406 }, { "epoch": 2.38, "learning_rate": 3.7310588958744215e-08, "logits/chosen": -2.1610052585601807, "logits/rejected": -2.3175148963928223, "logps/chosen": -0.32573506236076355, "logps/rejected": -0.31383460760116577, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 1.0701557397842407, "rewards/margins": 5.435943603515625e-05, "rewards/rejected": 1.0701013803482056, "step": 4407 }, { "epoch": 2.38, "learning_rate": 3.728946729808884e-08, "logits/chosen": -2.103184700012207, "logits/rejected": -2.1016383171081543, "logps/chosen": -2.2731826305389404, "logps/rejected": -6.409656524658203, "loss": 0.3484, "rewards/accuracies": 1.0, "rewards/chosen": 1.330923318862915, "rewards/margins": 0.8751490116119385, "rewards/rejected": 0.45577430725097656, "step": 4408 }, { "epoch": 2.38, "learning_rate": 3.726834806205173e-08, "logits/chosen": -2.2989864349365234, "logits/rejected": -2.1389925479888916, "logps/chosen": -25.291528701782227, "logps/rejected": -4.339184284210205, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": 2.470522165298462, "rewards/margins": 1.9933037757873535, "rewards/rejected": 0.4772183895111084, "step": 4409 }, { "epoch": 2.38, "learning_rate": 3.72472312546615e-08, "logits/chosen": -2.2195193767547607, "logits/rejected": -2.2218973636627197, "logps/chosen": -0.17124763131141663, "logps/rejected": -5.241481304168701, "loss": 0.4079, "rewards/accuracies": 1.0, "rewards/chosen": 1.0552349090576172, "rewards/margins": 0.6858900785446167, "rewards/rejected": 0.3693448603153229, "step": 4410 }, { "epoch": 2.38, "learning_rate": 3.722611687994632e-08, "logits/chosen": -2.214979410171509, "logits/rejected": -2.0763182640075684, "logps/chosen": -40.769874572753906, "logps/rejected": -3.3339476585388184, "loss": 0.0951, "rewards/accuracies": 1.0, "rewards/chosen": 2.9917290210723877, "rewards/margins": 2.305202007293701, "rewards/rejected": 0.6865271329879761, "step": 4411 }, { "epoch": 2.38, "learning_rate": 3.720500494193391e-08, "logits/chosen": -2.1596901416778564, "logits/rejected": -2.324608325958252, "logps/chosen": -1.4202436208724976, "logps/rejected": -1.0240414142608643, "loss": 0.6472, "rewards/accuracies": 1.0, "rewards/chosen": 0.9736008644104004, "rewards/margins": 0.09407585859298706, "rewards/rejected": 0.8795250058174133, "step": 4412 }, { "epoch": 2.38, "learning_rate": 3.718389544465151e-08, "logits/chosen": -2.237299919128418, "logits/rejected": -2.0854568481445312, "logps/chosen": -33.484153747558594, "logps/rejected": -1.7098041772842407, "loss": 0.1321, "rewards/accuracies": 1.0, "rewards/chosen": 2.910950183868408, "rewards/margins": 1.9570602178573608, "rewards/rejected": 0.9538899660110474, "step": 4413 }, { "epoch": 2.38, "learning_rate": 3.716278839212589e-08, "logits/chosen": -2.126276731491089, "logits/rejected": -2.2871570587158203, "logps/chosen": -10.614721298217773, "logps/rejected": -8.885335922241211, "loss": 0.4689, "rewards/accuracies": 1.0, "rewards/chosen": 1.4138176441192627, "rewards/margins": 0.5136837363243103, "rewards/rejected": 0.9001339077949524, "step": 4414 }, { "epoch": 2.38, "learning_rate": 3.714168378838335e-08, "logits/chosen": -2.154045581817627, "logits/rejected": -2.1534934043884277, "logps/chosen": -6.668325424194336, "logps/rejected": -2.3129963874816895, "loss": 0.4553, "rewards/accuracies": 1.0, "rewards/chosen": 1.2869125604629517, "rewards/margins": 0.5505063533782959, "rewards/rejected": 0.7364062070846558, "step": 4415 }, { "epoch": 2.38, "learning_rate": 3.712058163744975e-08, "logits/chosen": -2.1926279067993164, "logits/rejected": -2.2377231121063232, "logps/chosen": -1.2957152128219604, "logps/rejected": -1.2373731136322021, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 1.053423523902893, "rewards/margins": 0.008173704147338867, "rewards/rejected": 1.0452498197555542, "step": 4416 }, { "epoch": 2.38, "learning_rate": 3.709948194335047e-08, "logits/chosen": -2.1689515113830566, "logits/rejected": -2.2700607776641846, "logps/chosen": -2.858948230743408, "logps/rejected": -2.883007049560547, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 1.1061468124389648, "rewards/margins": 0.023543119430541992, "rewards/rejected": 1.0826036930084229, "step": 4417 }, { "epoch": 2.38, "learning_rate": 3.70783847101104e-08, "logits/chosen": -2.167945146560669, "logits/rejected": -2.334221601486206, "logps/chosen": -1.763650894165039, "logps/rejected": -1.9917998313903809, "loss": 0.6715, "rewards/accuracies": 1.0, "rewards/chosen": 0.769797146320343, "rewards/margins": 0.043814897537231445, "rewards/rejected": 0.7259822487831116, "step": 4418 }, { "epoch": 2.38, "learning_rate": 3.705728994175399e-08, "logits/chosen": -2.1637330055236816, "logits/rejected": -2.1570467948913574, "logps/chosen": -2.1979072093963623, "logps/rejected": -2.8798043727874756, "loss": 0.5145, "rewards/accuracies": 1.0, "rewards/chosen": 1.1674307584762573, "rewards/margins": 0.3964267373085022, "rewards/rejected": 0.7710040211677551, "step": 4419 }, { "epoch": 2.38, "learning_rate": 3.703619764230519e-08, "logits/chosen": -1.899182677268982, "logits/rejected": -2.2824783325195312, "logps/chosen": -0.7119759917259216, "logps/rejected": -0.7803167700767517, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 1.0830401182174683, "rewards/margins": 0.0002911090850830078, "rewards/rejected": 1.0827490091323853, "step": 4420 }, { "epoch": 2.38, "learning_rate": 3.701510781578752e-08, "logits/chosen": -2.075732707977295, "logits/rejected": -2.2785661220550537, "logps/chosen": -0.33731192350387573, "logps/rejected": -0.32532036304473877, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.867419421672821, "rewards/margins": 0.023276031017303467, "rewards/rejected": 0.8441433906555176, "step": 4421 }, { "epoch": 2.39, "learning_rate": 3.6994020466223974e-08, "logits/chosen": -2.170114517211914, "logits/rejected": -2.2969424724578857, "logps/chosen": -4.701776504516602, "logps/rejected": -0.4207703471183777, "loss": 0.7174, "rewards/accuracies": 0.0, "rewards/chosen": 0.9403432011604309, "rewards/margins": -0.04791831970214844, "rewards/rejected": 0.9882615208625793, "step": 4422 }, { "epoch": 2.39, "learning_rate": 3.697293559763713e-08, "logits/chosen": -2.164112091064453, "logits/rejected": -2.145779609680176, "logps/chosen": -14.371435165405273, "logps/rejected": -4.146574020385742, "loss": 0.3054, "rewards/accuracies": 1.0, "rewards/chosen": 1.4309908151626587, "rewards/margins": 1.0296719074249268, "rewards/rejected": 0.4013189375400543, "step": 4423 }, { "epoch": 2.39, "learning_rate": 3.695185321404906e-08, "logits/chosen": -1.9680328369140625, "logits/rejected": -2.2628703117370605, "logps/chosen": -0.8501471877098083, "logps/rejected": -0.9261363744735718, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.9514902234077454, "rewards/margins": 0.023123860359191895, "rewards/rejected": 0.9283663630485535, "step": 4424 }, { "epoch": 2.39, "learning_rate": 3.693077331948136e-08, "logits/chosen": -2.1086061000823975, "logits/rejected": -2.0940377712249756, "logps/chosen": -16.801179885864258, "logps/rejected": -4.750883102416992, "loss": 0.2553, "rewards/accuracies": 1.0, "rewards/chosen": 1.5655192136764526, "rewards/margins": 1.2350019216537476, "rewards/rejected": 0.3305172920227051, "step": 4425 }, { "epoch": 2.39, "learning_rate": 3.6909695917955145e-08, "logits/chosen": -1.969978928565979, "logits/rejected": -1.977426528930664, "logps/chosen": -1.5197380781173706, "logps/rejected": -3.281327962875366, "loss": 0.4512, "rewards/accuracies": 1.0, "rewards/chosen": 1.1342653036117554, "rewards/margins": 0.5617074370384216, "rewards/rejected": 0.5725578665733337, "step": 4426 }, { "epoch": 2.39, "learning_rate": 3.6888621013491106e-08, "logits/chosen": -2.040095090866089, "logits/rejected": -2.042300224304199, "logps/chosen": -3.273327112197876, "logps/rejected": -4.6543779373168945, "loss": 0.4763, "rewards/accuracies": 1.0, "rewards/chosen": 1.042062520980835, "rewards/margins": 0.4940844774246216, "rewards/rejected": 0.5479780435562134, "step": 4427 }, { "epoch": 2.39, "learning_rate": 3.686754861010939e-08, "logits/chosen": -2.1332030296325684, "logits/rejected": -2.1291308403015137, "logps/chosen": -0.7194373607635498, "logps/rejected": -6.459498405456543, "loss": 0.498, "rewards/accuracies": 1.0, "rewards/chosen": 1.0777437686920166, "rewards/margins": 0.43778032064437866, "rewards/rejected": 0.6399634480476379, "step": 4428 }, { "epoch": 2.39, "learning_rate": 3.6846478711829725e-08, "logits/chosen": -2.047675848007202, "logits/rejected": -2.0419557094573975, "logps/chosen": -2.786447286605835, "logps/rejected": -3.6061952114105225, "loss": 0.5573, "rewards/accuracies": 1.0, "rewards/chosen": 0.9784698486328125, "rewards/margins": 0.29301267862319946, "rewards/rejected": 0.685457170009613, "step": 4429 }, { "epoch": 2.39, "learning_rate": 3.6825411322671314e-08, "logits/chosen": -2.105470895767212, "logits/rejected": -2.3448374271392822, "logps/chosen": -12.887182235717773, "logps/rejected": -6.125392913818359, "loss": 0.8726, "rewards/accuracies": 0.0, "rewards/chosen": 0.6346502304077148, "rewards/margins": -0.33158648014068604, "rewards/rejected": 0.9662367105484009, "step": 4430 }, { "epoch": 2.39, "learning_rate": 3.68043464466529e-08, "logits/chosen": -2.077025890350342, "logits/rejected": -2.343191623687744, "logps/chosen": -0.28093478083610535, "logps/rejected": -0.2931475043296814, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 1.0791208744049072, "rewards/margins": -0.010183930397033691, "rewards/rejected": 1.089304804801941, "step": 4431 }, { "epoch": 2.39, "learning_rate": 3.678328408779275e-08, "logits/chosen": -2.0379223823547363, "logits/rejected": -2.295041561126709, "logps/chosen": -0.6497287750244141, "logps/rejected": -0.6336853504180908, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.9933714270591736, "rewards/margins": 0.04275476932525635, "rewards/rejected": 0.9506166577339172, "step": 4432 }, { "epoch": 2.39, "learning_rate": 3.676222425010867e-08, "logits/chosen": -2.051239490509033, "logits/rejected": -2.2831459045410156, "logps/chosen": -0.8872386813163757, "logps/rejected": -0.8326825499534607, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 1.012104868888855, "rewards/margins": 5.829334259033203e-05, "rewards/rejected": 1.0120465755462646, "step": 4433 }, { "epoch": 2.39, "learning_rate": 3.674116693761793e-08, "logits/chosen": -2.0775911808013916, "logits/rejected": -2.281132221221924, "logps/chosen": -0.4243806004524231, "logps/rejected": -0.4390278458595276, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 1.0344009399414062, "rewards/margins": 0.004187345504760742, "rewards/rejected": 1.0302135944366455, "step": 4434 }, { "epoch": 2.39, "learning_rate": 3.672011215433737e-08, "logits/chosen": -2.134860038757324, "logits/rejected": -2.140437602996826, "logps/chosen": -1.414308786392212, "logps/rejected": -2.0232856273651123, "loss": 0.4351, "rewards/accuracies": 1.0, "rewards/chosen": 1.1981533765792847, "rewards/margins": 0.6066541075706482, "rewards/rejected": 0.5914992690086365, "step": 4435 }, { "epoch": 2.39, "learning_rate": 3.6699059904283314e-08, "logits/chosen": -2.033857583999634, "logits/rejected": -2.0288896560668945, "logps/chosen": -2.529784917831421, "logps/rejected": -4.854581832885742, "loss": 0.2702, "rewards/accuracies": 1.0, "rewards/chosen": 1.565037727355957, "rewards/margins": 1.170268177986145, "rewards/rejected": 0.3947695791721344, "step": 4436 }, { "epoch": 2.39, "learning_rate": 3.667801019147162e-08, "logits/chosen": -2.0794670581817627, "logits/rejected": -2.0777671337127686, "logps/chosen": -1.1961698532104492, "logps/rejected": -1.785929799079895, "loss": 0.5198, "rewards/accuracies": 1.0, "rewards/chosen": 1.1189841032028198, "rewards/margins": 0.383095920085907, "rewards/rejected": 0.7358881831169128, "step": 4437 }, { "epoch": 2.39, "learning_rate": 3.665696301991769e-08, "logits/chosen": -2.024789810180664, "logits/rejected": -2.0068371295928955, "logps/chosen": -3.6597979068756104, "logps/rejected": -7.003905773162842, "loss": 0.4114, "rewards/accuracies": 1.0, "rewards/chosen": 1.1222829818725586, "rewards/margins": 0.675480842590332, "rewards/rejected": 0.4468021094799042, "step": 4438 }, { "epoch": 2.39, "learning_rate": 3.663591839363639e-08, "logits/chosen": -1.9740256071090698, "logits/rejected": -2.2618722915649414, "logps/chosen": -0.5576085448265076, "logps/rejected": -0.5880628824234009, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.8519026637077332, "rewards/margins": 0.02158421277999878, "rewards/rejected": 0.8303184509277344, "step": 4439 }, { "epoch": 2.39, "learning_rate": 3.661487631664212e-08, "logits/chosen": -2.2107439041137695, "logits/rejected": -2.3215012550354004, "logps/chosen": -0.8151847124099731, "logps/rejected": -0.8837494850158691, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.9645038843154907, "rewards/margins": 0.011684238910675049, "rewards/rejected": 0.9528196454048157, "step": 4440 }, { "epoch": 2.4, "learning_rate": 3.659383679294879e-08, "logits/chosen": -2.1841349601745605, "logits/rejected": -2.089221477508545, "logps/chosen": -18.3464412689209, "logps/rejected": -4.996200084686279, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 2.5622079372406006, "rewards/margins": 2.201589345932007, "rewards/rejected": 0.3606186509132385, "step": 4441 }, { "epoch": 2.4, "learning_rate": 3.657279982656985e-08, "logits/chosen": -1.9683657884597778, "logits/rejected": -2.2897684574127197, "logps/chosen": -0.530341386795044, "logps/rejected": -0.591877281665802, "loss": 0.7143, "rewards/accuracies": 0.0, "rewards/chosen": 1.0704771280288696, "rewards/margins": -0.041774749755859375, "rewards/rejected": 1.112251877784729, "step": 4442 }, { "epoch": 2.4, "learning_rate": 3.6551765421518214e-08, "logits/chosen": -2.23291277885437, "logits/rejected": -2.2811367511749268, "logps/chosen": -2.4726622104644775, "logps/rejected": -2.555088758468628, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.8004885911941528, "rewards/margins": 0.033433616161346436, "rewards/rejected": 0.7670549750328064, "step": 4443 }, { "epoch": 2.4, "learning_rate": 3.653073358180635e-08, "logits/chosen": -2.0994811058044434, "logits/rejected": -2.2991180419921875, "logps/chosen": -0.5034255385398865, "logps/rejected": -0.5993707776069641, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 1.11333167552948, "rewards/margins": -0.018004179000854492, "rewards/rejected": 1.1313358545303345, "step": 4444 }, { "epoch": 2.4, "learning_rate": 3.650970431144622e-08, "logits/chosen": -2.0825130939483643, "logits/rejected": -2.2592718601226807, "logps/chosen": -2.291250467300415, "logps/rejected": -2.6276910305023193, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.7091740965843201, "rewards/margins": -0.010468721389770508, "rewards/rejected": 0.7196428179740906, "step": 4445 }, { "epoch": 2.4, "learning_rate": 3.6488677614449297e-08, "logits/chosen": -1.997371792793274, "logits/rejected": -2.291560173034668, "logps/chosen": -1.9625900983810425, "logps/rejected": -1.0307681560516357, "loss": 0.6645, "rewards/accuracies": 1.0, "rewards/chosen": 0.8174152374267578, "rewards/margins": 0.05804431438446045, "rewards/rejected": 0.7593709230422974, "step": 4446 }, { "epoch": 2.4, "learning_rate": 3.6467653494826554e-08, "logits/chosen": -2.1090683937072754, "logits/rejected": -2.2601206302642822, "logps/chosen": -0.36150574684143066, "logps/rejected": -0.39261430501937866, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.8913373351097107, "rewards/margins": 0.014231204986572266, "rewards/rejected": 0.8771061301231384, "step": 4447 }, { "epoch": 2.4, "learning_rate": 3.644663195658848e-08, "logits/chosen": -2.1492881774902344, "logits/rejected": -2.2983922958374023, "logps/chosen": -3.035475254058838, "logps/rejected": -3.3444645404815674, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.9139888882637024, "rewards/margins": 0.023846328258514404, "rewards/rejected": 0.890142560005188, "step": 4448 }, { "epoch": 2.4, "learning_rate": 3.6425613003745094e-08, "logits/chosen": -2.166264533996582, "logits/rejected": -2.3131966590881348, "logps/chosen": -0.15561246871948242, "logps/rejected": -0.15613965690135956, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.9587621688842773, "rewards/margins": 0.018004238605499268, "rewards/rejected": 0.9407579302787781, "step": 4449 }, { "epoch": 2.4, "learning_rate": 3.640459664030587e-08, "logits/chosen": -2.167935371398926, "logits/rejected": -2.075770616531372, "logps/chosen": -16.226646423339844, "logps/rejected": -1.4878721237182617, "loss": 0.2703, "rewards/accuracies": 1.0, "rewards/chosen": 2.0202279090881348, "rewards/margins": 1.1698524951934814, "rewards/rejected": 0.8503754734992981, "step": 4450 }, { "epoch": 2.4, "learning_rate": 3.638358287027985e-08, "logits/chosen": -2.0340256690979004, "logits/rejected": -2.276146411895752, "logps/chosen": -5.002679347991943, "logps/rejected": -1.1409326791763306, "loss": 0.7766, "rewards/accuracies": 0.0, "rewards/chosen": 0.7545053958892822, "rewards/margins": -0.16050994396209717, "rewards/rejected": 0.9150153398513794, "step": 4451 }, { "epoch": 2.4, "learning_rate": 3.6362571697675533e-08, "logits/chosen": -2.054368734359741, "logits/rejected": -2.2941877841949463, "logps/chosen": -0.24480018019676208, "logps/rejected": -0.23465150594711304, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.8310900926589966, "rewards/margins": 0.03647893667221069, "rewards/rejected": 0.7946111559867859, "step": 4452 }, { "epoch": 2.4, "learning_rate": 3.634156312650093e-08, "logits/chosen": -2.0617802143096924, "logits/rejected": -2.0603220462799072, "logps/chosen": -5.7341814041137695, "logps/rejected": -8.506393432617188, "loss": 0.3137, "rewards/accuracies": 1.0, "rewards/chosen": 1.2333295345306396, "rewards/margins": 0.9982081055641174, "rewards/rejected": 0.2351214438676834, "step": 4453 }, { "epoch": 2.4, "learning_rate": 3.632055716076359e-08, "logits/chosen": -2.071023464202881, "logits/rejected": -2.370453119277954, "logps/chosen": -6.556442737579346, "logps/rejected": -7.7025299072265625, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 1.108355164527893, "rewards/margins": -0.015500664710998535, "rewards/rejected": 1.1238558292388916, "step": 4454 }, { "epoch": 2.4, "learning_rate": 3.629955380447051e-08, "logits/chosen": -2.1392033100128174, "logits/rejected": -2.12968111038208, "logps/chosen": -13.210770606994629, "logps/rejected": -13.634857177734375, "loss": 0.3355, "rewards/accuracies": 1.0, "rewards/chosen": 1.431904673576355, "rewards/margins": 0.9198668003082275, "rewards/rejected": 0.5120378732681274, "step": 4455 }, { "epoch": 2.4, "learning_rate": 3.627855306162825e-08, "logits/chosen": -1.9926142692565918, "logits/rejected": -2.3251240253448486, "logps/chosen": -0.4617243707180023, "logps/rejected": -0.4221583604812622, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 1.0607366561889648, "rewards/margins": 0.01996028423309326, "rewards/rejected": 1.0407763719558716, "step": 4456 }, { "epoch": 2.4, "learning_rate": 3.625755493624282e-08, "logits/chosen": -2.0677664279937744, "logits/rejected": -2.1085519790649414, "logps/chosen": -3.281888961791992, "logps/rejected": -9.43268871307373, "loss": 0.2459, "rewards/accuracies": 1.0, "rewards/chosen": 1.7565683126449585, "rewards/margins": 1.2771795988082886, "rewards/rejected": 0.4793887138366699, "step": 4457 }, { "epoch": 2.4, "learning_rate": 3.6236559432319744e-08, "logits/chosen": -2.01125168800354, "logits/rejected": -2.017585277557373, "logps/chosen": -2.437666654586792, "logps/rejected": -1.6164114475250244, "loss": 0.5514, "rewards/accuracies": 1.0, "rewards/chosen": 1.158166766166687, "rewards/margins": 0.30685657262802124, "rewards/rejected": 0.8513101935386658, "step": 4458 }, { "epoch": 2.41, "learning_rate": 3.6215566553864076e-08, "logits/chosen": -2.1261637210845947, "logits/rejected": -2.3159985542297363, "logps/chosen": -4.128374099731445, "logps/rejected": -4.212923526763916, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.48904475569725037, "rewards/margins": 0.01936429738998413, "rewards/rejected": 0.46968045830726624, "step": 4459 }, { "epoch": 2.41, "learning_rate": 3.6194576304880316e-08, "logits/chosen": -1.9897831678390503, "logits/rejected": -1.9913212060928345, "logps/chosen": -0.30963465571403503, "logps/rejected": -4.001358509063721, "loss": 0.5122, "rewards/accuracies": 1.0, "rewards/chosen": 1.0004786252975464, "rewards/margins": 0.4021035432815552, "rewards/rejected": 0.5983750820159912, "step": 4460 }, { "epoch": 2.41, "learning_rate": 3.617358868937253e-08, "logits/chosen": -2.0050008296966553, "logits/rejected": -2.255265712738037, "logps/chosen": -0.8433904051780701, "logps/rejected": -0.7511903643608093, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 1.0315412282943726, "rewards/margins": 0.014083623886108398, "rewards/rejected": 1.0174576044082642, "step": 4461 }, { "epoch": 2.41, "learning_rate": 3.615260371134424e-08, "logits/chosen": -2.0282082557678223, "logits/rejected": -2.0354714393615723, "logps/chosen": -1.632549524307251, "logps/rejected": -3.3569650650024414, "loss": 0.4634, "rewards/accuracies": 1.0, "rewards/chosen": 1.06569242477417, "rewards/margins": 0.5286545753479004, "rewards/rejected": 0.5370378494262695, "step": 4462 }, { "epoch": 2.41, "learning_rate": 3.613162137479845e-08, "logits/chosen": -2.0170977115631104, "logits/rejected": -2.2599756717681885, "logps/chosen": -0.24540400505065918, "logps/rejected": -0.2986339330673218, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 1.0148745775222778, "rewards/margins": 0.01729583740234375, "rewards/rejected": 0.9975787401199341, "step": 4463 }, { "epoch": 2.41, "learning_rate": 3.611064168373768e-08, "logits/chosen": -1.9896836280822754, "logits/rejected": -2.2752270698547363, "logps/chosen": -1.1771527528762817, "logps/rejected": -1.033711552619934, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.840162456035614, "rewards/margins": 0.024118423461914062, "rewards/rejected": 0.8160440325737, "step": 4464 }, { "epoch": 2.41, "learning_rate": 3.608966464216396e-08, "logits/chosen": -2.333307981491089, "logits/rejected": -2.1868062019348145, "logps/chosen": -33.47630310058594, "logps/rejected": -1.776658296585083, "loss": 0.1471, "rewards/accuracies": 1.0, "rewards/chosen": 2.7394371032714844, "rewards/margins": 1.8424854278564453, "rewards/rejected": 0.8969516754150391, "step": 4465 }, { "epoch": 2.41, "learning_rate": 3.606869025407878e-08, "logits/chosen": -2.197195291519165, "logits/rejected": -2.187380075454712, "logps/chosen": -0.33936142921447754, "logps/rejected": -0.35952886939048767, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.8761026263237, "rewards/margins": 0.005218088626861572, "rewards/rejected": 0.8708845376968384, "step": 4466 }, { "epoch": 2.41, "learning_rate": 3.6047718523483165e-08, "logits/chosen": -2.0522875785827637, "logits/rejected": -2.1352734565734863, "logps/chosen": -1.878610372543335, "logps/rejected": -10.44274616241455, "loss": 0.3325, "rewards/accuracies": 1.0, "rewards/chosen": 1.5452778339385986, "rewards/margins": 0.9301188588142395, "rewards/rejected": 0.6151589751243591, "step": 4467 }, { "epoch": 2.41, "learning_rate": 3.60267494543776e-08, "logits/chosen": -2.1935415267944336, "logits/rejected": -2.310168981552124, "logps/chosen": -3.96897029876709, "logps/rejected": -0.3815803527832031, "loss": 0.7322, "rewards/accuracies": 0.0, "rewards/chosen": 0.9561148881912231, "rewards/margins": -0.07658874988555908, "rewards/rejected": 1.0327036380767822, "step": 4468 }, { "epoch": 2.41, "learning_rate": 3.6005783050762074e-08, "logits/chosen": -2.016399621963501, "logits/rejected": -2.297828435897827, "logps/chosen": -1.0189276933670044, "logps/rejected": -1.114283561706543, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.6725412011146545, "rewards/margins": 0.006002545356750488, "rewards/rejected": 0.666538655757904, "step": 4469 }, { "epoch": 2.41, "learning_rate": 3.5984819316636076e-08, "logits/chosen": -2.0972378253936768, "logits/rejected": -2.3468167781829834, "logps/chosen": -5.685329437255859, "logps/rejected": -4.614518165588379, "loss": 0.7254, "rewards/accuracies": 0.0, "rewards/chosen": 0.919940173625946, "rewards/margins": -0.06359076499938965, "rewards/rejected": 0.9835309386253357, "step": 4470 }, { "epoch": 2.41, "learning_rate": 3.596385825599855e-08, "logits/chosen": -2.20873761177063, "logits/rejected": -2.207934856414795, "logps/chosen": -1.9181331396102905, "logps/rejected": -5.539600372314453, "loss": 0.4313, "rewards/accuracies": 1.0, "rewards/chosen": 1.0759257078170776, "rewards/margins": 0.6175607442855835, "rewards/rejected": 0.45836496353149414, "step": 4471 }, { "epoch": 2.41, "learning_rate": 3.594289987284798e-08, "logits/chosen": -2.137442111968994, "logits/rejected": -2.297574520111084, "logps/chosen": -4.479624271392822, "logps/rejected": -10.2665376663208, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 1.033452033996582, "rewards/margins": 0.02039968967437744, "rewards/rejected": 1.0130523443222046, "step": 4472 }, { "epoch": 2.41, "learning_rate": 3.5921944171182326e-08, "logits/chosen": -2.0863654613494873, "logits/rejected": -2.2495827674865723, "logps/chosen": -4.8453145027160645, "logps/rejected": -0.4740749001502991, "loss": 0.7674, "rewards/accuracies": 0.0, "rewards/chosen": 0.7132266759872437, "rewards/margins": -0.1434301733970642, "rewards/rejected": 0.8566568493843079, "step": 4473 }, { "epoch": 2.41, "learning_rate": 3.590099115499901e-08, "logits/chosen": -2.1256604194641113, "logits/rejected": -2.038764476776123, "logps/chosen": -22.40719985961914, "logps/rejected": -2.675452947616577, "loss": 0.2665, "rewards/accuracies": 1.0, "rewards/chosen": 1.7971619367599487, "rewards/margins": 1.1860198974609375, "rewards/rejected": 0.6111419796943665, "step": 4474 }, { "epoch": 2.41, "learning_rate": 3.588004082829496e-08, "logits/chosen": -1.985304832458496, "logits/rejected": -2.30289363861084, "logps/chosen": -2.184237480163574, "logps/rejected": -11.525331497192383, "loss": 0.5758, "rewards/accuracies": 1.0, "rewards/chosen": 0.9515236020088196, "rewards/margins": 0.2502861022949219, "rewards/rejected": 0.7012374997138977, "step": 4475 }, { "epoch": 2.41, "learning_rate": 3.585909319506659e-08, "logits/chosen": -2.0698492527008057, "logits/rejected": -2.1351253986358643, "logps/chosen": -10.490560531616211, "logps/rejected": -6.566110610961914, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 1.1550992727279663, "rewards/margins": 0.04204976558685303, "rewards/rejected": 1.1130495071411133, "step": 4476 }, { "epoch": 2.41, "learning_rate": 3.58381482593098e-08, "logits/chosen": -2.187898874282837, "logits/rejected": -2.2243540287017822, "logps/chosen": -11.416872024536133, "logps/rejected": -11.917619705200195, "loss": 0.4357, "rewards/accuracies": 1.0, "rewards/chosen": 1.3650238513946533, "rewards/margins": 0.6051633954048157, "rewards/rejected": 0.7598604559898376, "step": 4477 }, { "epoch": 2.42, "learning_rate": 3.5817206025019985e-08, "logits/chosen": -2.189835786819458, "logits/rejected": -2.352642297744751, "logps/chosen": -1.4711958169937134, "logps/rejected": -1.5744116306304932, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.9713422656059265, "rewards/margins": 0.025546729564666748, "rewards/rejected": 0.9457955360412598, "step": 4478 }, { "epoch": 2.42, "learning_rate": 3.579626649619201e-08, "logits/chosen": -2.198276996612549, "logits/rejected": -2.085103750228882, "logps/chosen": -36.333343505859375, "logps/rejected": -2.953106164932251, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 2.6474595069885254, "rewards/margins": 2.093731164932251, "rewards/rejected": 0.5537284016609192, "step": 4479 }, { "epoch": 2.42, "learning_rate": 3.577532967682022e-08, "logits/chosen": -2.081885814666748, "logits/rejected": -2.064570903778076, "logps/chosen": -16.31903076171875, "logps/rejected": -4.805138111114502, "loss": 0.3885, "rewards/accuracies": 1.0, "rewards/chosen": 1.6527023315429688, "rewards/margins": 0.7450761795043945, "rewards/rejected": 0.9076261520385742, "step": 4480 }, { "epoch": 2.42, "learning_rate": 3.575439557089845e-08, "logits/chosen": -2.016192674636841, "logits/rejected": -2.0150129795074463, "logps/chosen": -0.6791779398918152, "logps/rejected": -3.7943856716156006, "loss": 0.4452, "rewards/accuracies": 1.0, "rewards/chosen": 1.0390325784683228, "rewards/margins": 0.5782743692398071, "rewards/rejected": 0.4607582092285156, "step": 4481 }, { "epoch": 2.42, "learning_rate": 3.573346418242001e-08, "logits/chosen": -2.077892303466797, "logits/rejected": -2.0745716094970703, "logps/chosen": -6.493126392364502, "logps/rejected": -5.657758712768555, "loss": 0.6421, "rewards/accuracies": 1.0, "rewards/chosen": 1.1804355382919312, "rewards/margins": 0.10478949546813965, "rewards/rejected": 1.0756460428237915, "step": 4482 }, { "epoch": 2.42, "learning_rate": 3.571253551537774e-08, "logits/chosen": -2.0345194339752197, "logits/rejected": -2.040253162384033, "logps/chosen": -0.6536292433738708, "logps/rejected": -3.8370420932769775, "loss": 0.4722, "rewards/accuracies": 1.0, "rewards/chosen": 0.9918802380561829, "rewards/margins": 0.5051000714302063, "rewards/rejected": 0.48678016662597656, "step": 4483 }, { "epoch": 2.42, "learning_rate": 3.569160957376389e-08, "logits/chosen": -2.157649040222168, "logits/rejected": -2.1548848152160645, "logps/chosen": -5.982895851135254, "logps/rejected": -4.344125270843506, "loss": 0.3423, "rewards/accuracies": 1.0, "rewards/chosen": 1.3815219402313232, "rewards/margins": 0.89592444896698, "rewards/rejected": 0.4855974614620209, "step": 4484 }, { "epoch": 2.42, "learning_rate": 3.5670686361570236e-08, "logits/chosen": -2.1920201778411865, "logits/rejected": -2.354602098464966, "logps/chosen": -0.9041545987129211, "logps/rejected": -6.3498663902282715, "loss": 0.5867, "rewards/accuracies": 1.0, "rewards/chosen": 1.099624752998352, "rewards/margins": 0.2256215214729309, "rewards/rejected": 0.8740032315254211, "step": 4485 }, { "epoch": 2.42, "learning_rate": 3.564976588278803e-08, "logits/chosen": -2.066720962524414, "logits/rejected": -2.067359685897827, "logps/chosen": -1.3130136728286743, "logps/rejected": -0.868887186050415, "loss": 0.3586, "rewards/accuracies": 1.0, "rewards/chosen": 1.5674954652786255, "rewards/margins": 0.8408200144767761, "rewards/rejected": 0.7266754508018494, "step": 4486 }, { "epoch": 2.42, "learning_rate": 3.562884814140796e-08, "logits/chosen": -2.076301336288452, "logits/rejected": -2.0716283321380615, "logps/chosen": -9.4140625, "logps/rejected": -1.8341785669326782, "loss": 0.6568, "rewards/accuracies": 1.0, "rewards/chosen": 1.161481261253357, "rewards/margins": 0.07399463653564453, "rewards/rejected": 1.0874866247177124, "step": 4487 }, { "epoch": 2.42, "learning_rate": 3.5607933141420236e-08, "logits/chosen": -2.0342798233032227, "logits/rejected": -2.274444341659546, "logps/chosen": -0.2724083662033081, "logps/rejected": -0.2945113182067871, "loss": 0.6987, "rewards/accuracies": 0.0, "rewards/chosen": 0.9684039950370789, "rewards/margins": -0.011092424392700195, "rewards/rejected": 0.979496419429779, "step": 4488 }, { "epoch": 2.42, "learning_rate": 3.5587020886814544e-08, "logits/chosen": -2.1742331981658936, "logits/rejected": -2.175226926803589, "logps/chosen": -4.262356281280518, "logps/rejected": -10.178688049316406, "loss": 0.2022, "rewards/accuracies": 1.0, "rewards/chosen": 1.923073410987854, "rewards/margins": 1.4959239959716797, "rewards/rejected": 0.42714938521385193, "step": 4489 }, { "epoch": 2.42, "learning_rate": 3.556611138158002e-08, "logits/chosen": -2.1246209144592285, "logits/rejected": -2.110646963119507, "logps/chosen": -14.095561981201172, "logps/rejected": -6.7836408615112305, "loss": 0.2591, "rewards/accuracies": 1.0, "rewards/chosen": 1.51219642162323, "rewards/margins": 1.218252182006836, "rewards/rejected": 0.29394418001174927, "step": 4490 }, { "epoch": 2.42, "learning_rate": 3.554520462970531e-08, "logits/chosen": -2.1400179862976074, "logits/rejected": -2.2931506633758545, "logps/chosen": -0.8646921515464783, "logps/rejected": -0.8304231762886047, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9694916009902954, "rewards/margins": 0.016565918922424316, "rewards/rejected": 0.9529256820678711, "step": 4491 }, { "epoch": 2.42, "learning_rate": 3.552430063517849e-08, "logits/chosen": -2.0040342807769775, "logits/rejected": -2.0155935287475586, "logps/chosen": -6.428219795227051, "logps/rejected": -1.3996821641921997, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 1.1903456449508667, "rewards/margins": 0.03318607807159424, "rewards/rejected": 1.1571595668792725, "step": 4492 }, { "epoch": 2.42, "learning_rate": 3.550339940198714e-08, "logits/chosen": -2.0357182025909424, "logits/rejected": -2.2574973106384277, "logps/chosen": -9.209269523620605, "logps/rejected": -5.429047107696533, "loss": 0.7877, "rewards/accuracies": 0.0, "rewards/chosen": 0.8492478728294373, "rewards/margins": -0.1809520125389099, "rewards/rejected": 1.0301998853683472, "step": 4493 }, { "epoch": 2.42, "learning_rate": 3.548250093411833e-08, "logits/chosen": -2.0856516361236572, "logits/rejected": -2.2293524742126465, "logps/chosen": -0.6351317167282104, "logps/rejected": -0.6442435383796692, "loss": 0.6575, "rewards/accuracies": 1.0, "rewards/chosen": 0.9776625633239746, "rewards/margins": 0.07266902923583984, "rewards/rejected": 0.9049935340881348, "step": 4494 }, { "epoch": 2.42, "learning_rate": 3.5461605235558566e-08, "logits/chosen": -2.054164409637451, "logits/rejected": -2.054654836654663, "logps/chosen": -3.0586650371551514, "logps/rejected": -6.324072360992432, "loss": 0.2782, "rewards/accuracies": 1.0, "rewards/chosen": 1.6214300394058228, "rewards/margins": 1.1369175910949707, "rewards/rejected": 0.4845123887062073, "step": 4495 }, { "epoch": 2.43, "learning_rate": 3.5440712310293845e-08, "logits/chosen": -2.0412323474884033, "logits/rejected": -2.0431041717529297, "logps/chosen": -2.696059465408325, "logps/rejected": -0.5823529958724976, "loss": 0.6414, "rewards/accuracies": 1.0, "rewards/chosen": 1.0879274606704712, "rewards/margins": 0.10624021291732788, "rewards/rejected": 0.9816872477531433, "step": 4496 }, { "epoch": 2.43, "learning_rate": 3.541982216230963e-08, "logits/chosen": -2.0183398723602295, "logits/rejected": -2.268613576889038, "logps/chosen": -0.3160463571548462, "logps/rejected": -0.3685978949069977, "loss": 0.671, "rewards/accuracies": 1.0, "rewards/chosen": 1.0736005306243896, "rewards/margins": 0.044812679290771484, "rewards/rejected": 1.0287878513336182, "step": 4497 }, { "epoch": 2.43, "learning_rate": 3.539893479559085e-08, "logits/chosen": -2.0147712230682373, "logits/rejected": -2.023560047149658, "logps/chosen": -1.8791109323501587, "logps/rejected": -2.0679562091827393, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": 1.297735333442688, "rewards/margins": 0.651321530342102, "rewards/rejected": 0.6464138031005859, "step": 4498 }, { "epoch": 2.43, "learning_rate": 3.537805021412191e-08, "logits/chosen": -2.067969560623169, "logits/rejected": -2.2697079181671143, "logps/chosen": -0.7077029943466187, "logps/rejected": -0.740614652633667, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9503352046012878, "rewards/margins": 0.012368917465209961, "rewards/rejected": 0.9379662871360779, "step": 4499 }, { "epoch": 2.43, "learning_rate": 3.5357168421886694e-08, "logits/chosen": -2.039705514907837, "logits/rejected": -2.252307891845703, "logps/chosen": -10.124574661254883, "logps/rejected": -8.437698364257812, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.5306504368782043, "rewards/margins": 0.016153037548065186, "rewards/rejected": 0.5144973993301392, "step": 4500 }, { "epoch": 2.43, "learning_rate": 3.533628942286853e-08, "logits/chosen": -2.2231626510620117, "logits/rejected": -2.1305360794067383, "logps/chosen": -18.930133819580078, "logps/rejected": -2.12732195854187, "loss": 0.1505, "rewards/accuracies": 1.0, "rewards/chosen": 2.4902710914611816, "rewards/margins": 1.8178677558898926, "rewards/rejected": 0.6724032759666443, "step": 4501 }, { "epoch": 2.43, "learning_rate": 3.531541322105022e-08, "logits/chosen": -2.1392080783843994, "logits/rejected": -2.1429998874664307, "logps/chosen": -4.368514537811279, "logps/rejected": -0.4085424840450287, "loss": 0.572, "rewards/accuracies": 1.0, "rewards/chosen": 1.2313346862792969, "rewards/margins": 0.25897836685180664, "rewards/rejected": 0.9723563194274902, "step": 4502 }, { "epoch": 2.43, "learning_rate": 3.529453982041406e-08, "logits/chosen": -1.993911623954773, "logits/rejected": -2.038024663925171, "logps/chosen": -5.220151901245117, "logps/rejected": -7.859777450561523, "loss": 0.2951, "rewards/accuracies": 1.0, "rewards/chosen": 1.8137825727462769, "rewards/margins": 1.0691509246826172, "rewards/rejected": 0.7446317076683044, "step": 4503 }, { "epoch": 2.43, "learning_rate": 3.5273669224941774e-08, "logits/chosen": -2.039020299911499, "logits/rejected": -2.0352184772491455, "logps/chosen": -1.1489872932434082, "logps/rejected": -9.27901554107666, "loss": 0.3373, "rewards/accuracies": 1.0, "rewards/chosen": 1.0182923078536987, "rewards/margins": 0.9133954644203186, "rewards/rejected": 0.10489683598279953, "step": 4504 }, { "epoch": 2.43, "learning_rate": 3.525280143861453e-08, "logits/chosen": -2.0350475311279297, "logits/rejected": -2.226064443588257, "logps/chosen": -0.29619914293289185, "logps/rejected": -0.28090035915374756, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": 0.8853058218955994, "rewards/margins": -0.006658315658569336, "rewards/rejected": 0.8919641375541687, "step": 4505 }, { "epoch": 2.43, "learning_rate": 3.523193646541306e-08, "logits/chosen": -2.0821540355682373, "logits/rejected": -2.323336601257324, "logps/chosen": -2.2655022144317627, "logps/rejected": -2.5047402381896973, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8742157220840454, "rewards/margins": 0.011143326759338379, "rewards/rejected": 0.863072395324707, "step": 4506 }, { "epoch": 2.43, "learning_rate": 3.5211074309317474e-08, "logits/chosen": -2.086750030517578, "logits/rejected": -2.3373947143554688, "logps/chosen": -0.7354329228401184, "logps/rejected": -2.3042244911193848, "loss": 0.6166, "rewards/accuracies": 1.0, "rewards/chosen": 0.9601802825927734, "rewards/margins": 0.15941637754440308, "rewards/rejected": 0.8007639050483704, "step": 4507 }, { "epoch": 2.43, "learning_rate": 3.519021497430735e-08, "logits/chosen": -2.0916144847869873, "logits/rejected": -2.080092191696167, "logps/chosen": -0.8436919450759888, "logps/rejected": -10.9452543258667, "loss": 0.5349, "rewards/accuracies": 1.0, "rewards/chosen": 1.0401023626327515, "rewards/margins": 0.34636980295181274, "rewards/rejected": 0.6937325596809387, "step": 4508 }, { "epoch": 2.43, "learning_rate": 3.516935846436178e-08, "logits/chosen": -2.05843186378479, "logits/rejected": -2.232952117919922, "logps/chosen": -0.10829465091228485, "logps/rejected": -0.12005306780338287, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.9750109910964966, "rewards/margins": 0.025119900703430176, "rewards/rejected": 0.9498910903930664, "step": 4509 }, { "epoch": 2.43, "learning_rate": 3.514850478345924e-08, "logits/chosen": -2.0596249103546143, "logits/rejected": -2.0608437061309814, "logps/chosen": -1.5856256484985352, "logps/rejected": -1.8362219333648682, "loss": 0.645, "rewards/accuracies": 1.0, "rewards/chosen": 1.2057807445526123, "rewards/margins": 0.09870576858520508, "rewards/rejected": 1.1070749759674072, "step": 4510 }, { "epoch": 2.43, "learning_rate": 3.512765393557773e-08, "logits/chosen": -2.0207571983337402, "logits/rejected": -2.020894765853882, "logps/chosen": -1.1905779838562012, "logps/rejected": -2.4667649269104004, "loss": 0.5128, "rewards/accuracies": 1.0, "rewards/chosen": 1.3098779916763306, "rewards/margins": 0.40041637420654297, "rewards/rejected": 0.9094616174697876, "step": 4511 }, { "epoch": 2.43, "learning_rate": 3.510680592469467e-08, "logits/chosen": -2.0717873573303223, "logits/rejected": -2.0197250843048096, "logps/chosen": -11.538204193115234, "logps/rejected": -3.1358232498168945, "loss": 0.2598, "rewards/accuracies": 1.0, "rewards/chosen": 1.9948128461837769, "rewards/margins": 1.2152528762817383, "rewards/rejected": 0.7795600295066833, "step": 4512 }, { "epoch": 2.43, "learning_rate": 3.508596075478698e-08, "logits/chosen": -2.094388008117676, "logits/rejected": -2.1054627895355225, "logps/chosen": -7.23678731918335, "logps/rejected": -7.712466239929199, "loss": 0.1877, "rewards/accuracies": 1.0, "rewards/chosen": 2.1709721088409424, "rewards/margins": 1.5776193141937256, "rewards/rejected": 0.5933527946472168, "step": 4513 }, { "epoch": 2.43, "learning_rate": 3.5065118429831e-08, "logits/chosen": -2.135315179824829, "logits/rejected": -2.1457979679107666, "logps/chosen": -2.186849355697632, "logps/rejected": -1.7132371664047241, "loss": 0.531, "rewards/accuracies": 1.0, "rewards/chosen": 1.3859293460845947, "rewards/margins": 0.3558269739151001, "rewards/rejected": 1.0301023721694946, "step": 4514 }, { "epoch": 2.44, "learning_rate": 3.5044278953802544e-08, "logits/chosen": -1.9887021780014038, "logits/rejected": -1.9978152513504028, "logps/chosen": -4.020578384399414, "logps/rejected": -5.20463752746582, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": 1.7825924158096313, "rewards/margins": 1.3945863246917725, "rewards/rejected": 0.3880061209201813, "step": 4515 }, { "epoch": 2.44, "learning_rate": 3.502344233067686e-08, "logits/chosen": -2.0340471267700195, "logits/rejected": -2.0385007858276367, "logps/chosen": -3.086700677871704, "logps/rejected": -3.4943161010742188, "loss": 0.4062, "rewards/accuracies": 1.0, "rewards/chosen": 1.3415054082870483, "rewards/margins": 0.6909176707267761, "rewards/rejected": 0.6505877375602722, "step": 4516 }, { "epoch": 2.44, "learning_rate": 3.500260856442871e-08, "logits/chosen": -2.199205160140991, "logits/rejected": -2.2903828620910645, "logps/chosen": -0.6755402088165283, "logps/rejected": -0.7407923936843872, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.9319440126419067, "rewards/margins": 0.029556691646575928, "rewards/rejected": 0.9023873209953308, "step": 4517 }, { "epoch": 2.44, "learning_rate": 3.498177765903225e-08, "logits/chosen": -2.0192577838897705, "logits/rejected": -2.0015625953674316, "logps/chosen": -28.88724708557129, "logps/rejected": -10.180392265319824, "loss": 0.1835, "rewards/accuracies": 1.0, "rewards/chosen": 1.8014612197875977, "rewards/margins": 1.6025913953781128, "rewards/rejected": 0.19886980950832367, "step": 4518 }, { "epoch": 2.44, "learning_rate": 3.4960949618461124e-08, "logits/chosen": -1.9389694929122925, "logits/rejected": -1.9456583261489868, "logps/chosen": -3.1572985649108887, "logps/rejected": -4.893383502960205, "loss": 0.499, "rewards/accuracies": 1.0, "rewards/chosen": 0.9960002899169922, "rewards/margins": 0.4353899359703064, "rewards/rejected": 0.5606103539466858, "step": 4519 }, { "epoch": 2.44, "learning_rate": 3.494012444668841e-08, "logits/chosen": -2.038755416870117, "logits/rejected": -1.9826551675796509, "logps/chosen": -28.74720573425293, "logps/rejected": -3.5443122386932373, "loss": 0.1562, "rewards/accuracies": 1.0, "rewards/chosen": 2.30251407623291, "rewards/margins": 1.7776968479156494, "rewards/rejected": 0.5248172879219055, "step": 4520 }, { "epoch": 2.44, "learning_rate": 3.4919302147686647e-08, "logits/chosen": -2.057844400405884, "logits/rejected": -2.2872002124786377, "logps/chosen": -0.24505135416984558, "logps/rejected": -0.33034175634384155, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.9829002618789673, "rewards/margins": 0.0062386393547058105, "rewards/rejected": 0.9766616225242615, "step": 4521 }, { "epoch": 2.44, "learning_rate": 3.489848272542782e-08, "logits/chosen": -2.059096574783325, "logits/rejected": -2.2966277599334717, "logps/chosen": -0.3100706934928894, "logps/rejected": -0.31446874141693115, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.8916838765144348, "rewards/margins": 0.013509094715118408, "rewards/rejected": 0.8781747817993164, "step": 4522 }, { "epoch": 2.44, "learning_rate": 3.48776661838834e-08, "logits/chosen": -2.143629789352417, "logits/rejected": -2.1386051177978516, "logps/chosen": -4.771892070770264, "logps/rejected": -6.8667216300964355, "loss": 0.3561, "rewards/accuracies": 1.0, "rewards/chosen": 1.0839357376098633, "rewards/margins": 0.8492506742477417, "rewards/rejected": 0.23468509316444397, "step": 4523 }, { "epoch": 2.44, "learning_rate": 3.485685252702425e-08, "logits/chosen": -2.3714094161987305, "logits/rejected": -2.3122670650482178, "logps/chosen": -19.16958236694336, "logps/rejected": -6.384219646453857, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 2.5009334087371826, "rewards/margins": 2.161325454711914, "rewards/rejected": 0.33960795402526855, "step": 4524 }, { "epoch": 2.44, "learning_rate": 3.4836041758820724e-08, "logits/chosen": -2.1472766399383545, "logits/rejected": -2.1492435932159424, "logps/chosen": -2.035971164703369, "logps/rejected": -1.457111120223999, "loss": 0.6437, "rewards/accuracies": 1.0, "rewards/chosen": 1.05182683467865, "rewards/margins": 0.10152524709701538, "rewards/rejected": 0.9503015875816345, "step": 4525 }, { "epoch": 2.44, "learning_rate": 3.481523388324262e-08, "logits/chosen": -2.0924017429351807, "logits/rejected": -2.227844476699829, "logps/chosen": -3.9116506576538086, "logps/rejected": -0.2814888656139374, "loss": 0.6513, "rewards/accuracies": 1.0, "rewards/chosen": 0.844414234161377, "rewards/margins": 0.08552229404449463, "rewards/rejected": 0.7588919401168823, "step": 4526 }, { "epoch": 2.44, "learning_rate": 3.4794428904259146e-08, "logits/chosen": -2.0629162788391113, "logits/rejected": -2.056117296218872, "logps/chosen": -3.7320773601531982, "logps/rejected": -1.114248275756836, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 2.2610299587249756, "rewards/margins": 1.37103271484375, "rewards/rejected": 0.8899973034858704, "step": 4527 }, { "epoch": 2.44, "learning_rate": 3.477362682583903e-08, "logits/chosen": -1.9716464281082153, "logits/rejected": -1.9671393632888794, "logps/chosen": -1.099257230758667, "logps/rejected": -3.704610824584961, "loss": 0.6396, "rewards/accuracies": 1.0, "rewards/chosen": 0.9174246191978455, "rewards/margins": 0.1101982593536377, "rewards/rejected": 0.8072263598442078, "step": 4528 }, { "epoch": 2.44, "learning_rate": 3.4752827651950395e-08, "logits/chosen": -2.1217598915100098, "logits/rejected": -2.319265842437744, "logps/chosen": -1.078197956085205, "logps/rejected": -1.0513808727264404, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 1.0955355167388916, "rewards/margins": 0.020970940589904785, "rewards/rejected": 1.0745645761489868, "step": 4529 }, { "epoch": 2.44, "learning_rate": 3.473203138656081e-08, "logits/chosen": -2.1692051887512207, "logits/rejected": -2.133695363998413, "logps/chosen": -13.26701545715332, "logps/rejected": -6.428255558013916, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 1.767643928527832, "rewards/margins": 0.9470117688179016, "rewards/rejected": 0.8206321597099304, "step": 4530 }, { "epoch": 2.44, "learning_rate": 3.4711238033637315e-08, "logits/chosen": -2.1059746742248535, "logits/rejected": -2.108292818069458, "logps/chosen": -2.2925727367401123, "logps/rejected": -3.226487874984741, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 1.1902745962142944, "rewards/margins": 0.3280262351036072, "rewards/rejected": 0.8622483611106873, "step": 4531 }, { "epoch": 2.44, "learning_rate": 3.4690447597146384e-08, "logits/chosen": -2.0268490314483643, "logits/rejected": -2.280322551727295, "logps/chosen": -0.8624642491340637, "logps/rejected": -0.8069546222686768, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 1.0540509223937988, "rewards/margins": 0.0421220064163208, "rewards/rejected": 1.011928915977478, "step": 4532 }, { "epoch": 2.44, "learning_rate": 3.4669660081053896e-08, "logits/chosen": -2.15093731880188, "logits/rejected": -2.230867624282837, "logps/chosen": -6.341421127319336, "logps/rejected": -5.3915205001831055, "loss": 0.671, "rewards/accuracies": 1.0, "rewards/chosen": 0.9825334548950195, "rewards/margins": 0.0448647141456604, "rewards/rejected": 0.9376687407493591, "step": 4533 }, { "epoch": 2.45, "learning_rate": 3.464887548932524e-08, "logits/chosen": -2.222621202468872, "logits/rejected": -2.276979684829712, "logps/chosen": -5.032717704772949, "logps/rejected": -17.30629539489746, "loss": 0.3183, "rewards/accuracies": 1.0, "rewards/chosen": 1.4678502082824707, "rewards/margins": 0.9812502861022949, "rewards/rejected": 0.4865999221801758, "step": 4534 }, { "epoch": 2.45, "learning_rate": 3.46280938259252e-08, "logits/chosen": -2.1244497299194336, "logits/rejected": -2.1103460788726807, "logps/chosen": -3.2277512550354004, "logps/rejected": -5.272777557373047, "loss": 0.3012, "rewards/accuracies": 1.0, "rewards/chosen": 1.747121810913086, "rewards/margins": 1.0455244779586792, "rewards/rejected": 0.7015973329544067, "step": 4535 }, { "epoch": 2.45, "learning_rate": 3.4607315094818024e-08, "logits/chosen": -2.0872559547424316, "logits/rejected": -2.298715591430664, "logps/chosen": -0.7965343594551086, "logps/rejected": -0.8148205876350403, "loss": 0.7328, "rewards/accuracies": 0.0, "rewards/chosen": 0.8342856764793396, "rewards/margins": -0.0778878927230835, "rewards/rejected": 0.9121735692024231, "step": 4536 }, { "epoch": 2.45, "learning_rate": 3.458653929996739e-08, "logits/chosen": -2.1637067794799805, "logits/rejected": -2.163543701171875, "logps/chosen": -2.5614280700683594, "logps/rejected": -8.039617538452148, "loss": 0.3047, "rewards/accuracies": 1.0, "rewards/chosen": 1.0857117176055908, "rewards/margins": 1.0320823192596436, "rewards/rejected": 0.053629398345947266, "step": 4537 }, { "epoch": 2.45, "learning_rate": 3.4565766445336385e-08, "logits/chosen": -2.091599702835083, "logits/rejected": -2.2463552951812744, "logps/chosen": -0.34795671701431274, "logps/rejected": -0.3442583680152893, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.9850060343742371, "rewards/margins": 0.0009267330169677734, "rewards/rejected": 0.9840793013572693, "step": 4538 }, { "epoch": 2.45, "learning_rate": 3.4544996534887636e-08, "logits/chosen": -1.9515926837921143, "logits/rejected": -1.950419306755066, "logps/chosen": -1.0847957134246826, "logps/rejected": -1.3917651176452637, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9777532815933228, "rewards/margins": 0.016210198402404785, "rewards/rejected": 0.961543083190918, "step": 4539 }, { "epoch": 2.45, "learning_rate": 3.45242295725831e-08, "logits/chosen": -2.113790512084961, "logits/rejected": -2.377405881881714, "logps/chosen": -15.432337760925293, "logps/rejected": -13.072980880737305, "loss": 0.7659, "rewards/accuracies": 0.0, "rewards/chosen": 1.2150403261184692, "rewards/margins": -0.1406468152999878, "rewards/rejected": 1.355687141418457, "step": 4540 }, { "epoch": 2.45, "learning_rate": 3.450346556238422e-08, "logits/chosen": -2.097391128540039, "logits/rejected": -2.1057732105255127, "logps/chosen": -2.2315359115600586, "logps/rejected": -6.780024528503418, "loss": 0.3378, "rewards/accuracies": 1.0, "rewards/chosen": 1.1736645698547363, "rewards/margins": 0.9116174578666687, "rewards/rejected": 0.2620471119880676, "step": 4541 }, { "epoch": 2.45, "learning_rate": 3.448270450825185e-08, "logits/chosen": -2.0168375968933105, "logits/rejected": -2.3343021869659424, "logps/chosen": -0.7951797842979431, "logps/rejected": -0.7771204113960266, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.8756009936332703, "rewards/margins": 0.026672720909118652, "rewards/rejected": 0.8489282727241516, "step": 4542 }, { "epoch": 2.45, "learning_rate": 3.446194641414633e-08, "logits/chosen": -2.049346923828125, "logits/rejected": -2.0555315017700195, "logps/chosen": -1.206599235534668, "logps/rejected": -5.945240497589111, "loss": 0.4184, "rewards/accuracies": 1.0, "rewards/chosen": 1.181638479232788, "rewards/margins": 0.654879093170166, "rewards/rejected": 0.5267593860626221, "step": 4543 }, { "epoch": 2.45, "learning_rate": 3.444119128402738e-08, "logits/chosen": -2.0681772232055664, "logits/rejected": -2.0730438232421875, "logps/chosen": -0.8155798316001892, "logps/rejected": -17.40935516357422, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 1.08174467086792, "rewards/margins": 0.03520643711090088, "rewards/rejected": 1.046538233757019, "step": 4544 }, { "epoch": 2.45, "learning_rate": 3.44204391218542e-08, "logits/chosen": -1.9962514638900757, "logits/rejected": -1.9495232105255127, "logps/chosen": -9.893568992614746, "logps/rejected": -1.5741864442825317, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 1.5074610710144043, "rewards/margins": 0.551791250705719, "rewards/rejected": 0.9556698203086853, "step": 4545 }, { "epoch": 2.45, "learning_rate": 3.439968993158537e-08, "logits/chosen": -2.169058322906494, "logits/rejected": -2.1963958740234375, "logps/chosen": -0.8453956842422485, "logps/rejected": -8.577749252319336, "loss": 0.4511, "rewards/accuracies": 1.0, "rewards/chosen": 1.100152850151062, "rewards/margins": 0.5621151328086853, "rewards/rejected": 0.5380377173423767, "step": 4546 }, { "epoch": 2.45, "learning_rate": 3.437894371717896e-08, "logits/chosen": -2.0615010261535645, "logits/rejected": -2.0482470989227295, "logps/chosen": -6.232240676879883, "logps/rejected": -3.1149067878723145, "loss": 0.4025, "rewards/accuracies": 1.0, "rewards/chosen": 1.4845316410064697, "rewards/margins": 0.7022174000740051, "rewards/rejected": 0.7823142409324646, "step": 4547 }, { "epoch": 2.45, "learning_rate": 3.4358200482592456e-08, "logits/chosen": -2.2262439727783203, "logits/rejected": -2.3289573192596436, "logps/chosen": -0.5214862823486328, "logps/rejected": -0.5542751550674438, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 1.037007212638855, "rewards/margins": -3.1828880310058594e-05, "rewards/rejected": 1.037039041519165, "step": 4548 }, { "epoch": 2.45, "learning_rate": 3.433746023178273e-08, "logits/chosen": -2.0694382190704346, "logits/rejected": -2.2835536003112793, "logps/chosen": -0.38435909152030945, "logps/rejected": -0.3341282308101654, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.9724260568618774, "rewards/margins": 0.040597379207611084, "rewards/rejected": 0.9318286776542664, "step": 4549 }, { "epoch": 2.45, "learning_rate": 3.431672296870616e-08, "logits/chosen": -1.960668921470642, "logits/rejected": -2.2577271461486816, "logps/chosen": -1.2895379066467285, "logps/rejected": -1.1517308950424194, "loss": 0.7021, "rewards/accuracies": 0.0, "rewards/chosen": 0.972355842590332, "rewards/margins": -0.01778954267501831, "rewards/rejected": 0.9901453852653503, "step": 4550 }, { "epoch": 2.45, "learning_rate": 3.42959886973185e-08, "logits/chosen": -2.0524742603302, "logits/rejected": -2.0554473400115967, "logps/chosen": -1.3105992078781128, "logps/rejected": -1.576109766960144, "loss": 0.5706, "rewards/accuracies": 1.0, "rewards/chosen": 0.9925494194030762, "rewards/margins": 0.2621936798095703, "rewards/rejected": 0.7303557395935059, "step": 4551 }, { "epoch": 2.46, "learning_rate": 3.4275257421574955e-08, "logits/chosen": -2.079502820968628, "logits/rejected": -2.291177749633789, "logps/chosen": -2.900465965270996, "logps/rejected": -8.864372253417969, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 0.8247434496879578, "rewards/margins": -0.017223000526428223, "rewards/rejected": 0.841966450214386, "step": 4552 }, { "epoch": 2.46, "learning_rate": 3.425452914543015e-08, "logits/chosen": -2.070819139480591, "logits/rejected": -2.2899186611175537, "logps/chosen": -1.1049792766571045, "logps/rejected": -1.069693684577942, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 1.1229504346847534, "rewards/margins": 0.008354425430297852, "rewards/rejected": 1.1145960092544556, "step": 4553 }, { "epoch": 2.46, "learning_rate": 3.423380387283814e-08, "logits/chosen": -2.0874133110046387, "logits/rejected": -2.3135273456573486, "logps/chosen": -1.1969224214553833, "logps/rejected": -1.1437630653381348, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 1.072285532951355, "rewards/margins": 0.030520081520080566, "rewards/rejected": 1.0417654514312744, "step": 4554 }, { "epoch": 2.46, "learning_rate": 3.4213081607752406e-08, "logits/chosen": -2.044969320297241, "logits/rejected": -2.047330141067505, "logps/chosen": -1.4992798566818237, "logps/rejected": -1.5619170665740967, "loss": 0.5259, "rewards/accuracies": 1.0, "rewards/chosen": 1.0770610570907593, "rewards/margins": 0.36821073293685913, "rewards/rejected": 0.7088503241539001, "step": 4555 }, { "epoch": 2.46, "learning_rate": 3.419236235412587e-08, "logits/chosen": -2.1566975116729736, "logits/rejected": -2.155510187149048, "logps/chosen": -2.700751543045044, "logps/rejected": -2.7471330165863037, "loss": 0.4124, "rewards/accuracies": 1.0, "rewards/chosen": 1.4356344938278198, "rewards/margins": 0.6724793910980225, "rewards/rejected": 0.7631551027297974, "step": 4556 }, { "epoch": 2.46, "learning_rate": 3.417164611591085e-08, "logits/chosen": -2.1174380779266357, "logits/rejected": -2.268577814102173, "logps/chosen": -0.3017390966415405, "logps/rejected": -0.3000478148460388, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.8539502024650574, "rewards/margins": 0.029160380363464355, "rewards/rejected": 0.824789822101593, "step": 4557 }, { "epoch": 2.46, "learning_rate": 3.415093289705911e-08, "logits/chosen": -2.033795118331909, "logits/rejected": -2.016073226928711, "logps/chosen": -6.629900932312012, "logps/rejected": -5.2174177169799805, "loss": 0.3246, "rewards/accuracies": 1.0, "rewards/chosen": 1.4060227870941162, "rewards/margins": 0.9584545493125916, "rewards/rejected": 0.44756823778152466, "step": 4558 }, { "epoch": 2.46, "learning_rate": 3.413022270152184e-08, "logits/chosen": -2.211580753326416, "logits/rejected": -2.297271966934204, "logps/chosen": -5.589658737182617, "logps/rejected": -1.4550166130065918, "loss": 0.8001, "rewards/accuracies": 0.0, "rewards/chosen": 1.0601481199264526, "rewards/margins": -0.20352816581726074, "rewards/rejected": 1.2636762857437134, "step": 4559 }, { "epoch": 2.46, "learning_rate": 3.410951553324963e-08, "logits/chosen": -2.0690605640411377, "logits/rejected": -2.2634220123291016, "logps/chosen": -1.8453571796417236, "logps/rejected": -1.9475221633911133, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.7125051617622375, "rewards/margins": 0.03596287965774536, "rewards/rejected": 0.6765422821044922, "step": 4560 }, { "epoch": 2.46, "learning_rate": 3.408881139619251e-08, "logits/chosen": -2.0449771881103516, "logits/rejected": -2.281381607055664, "logps/chosen": -0.45290544629096985, "logps/rejected": -0.47184300422668457, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 1.0295093059539795, "rewards/margins": 0.02214038372039795, "rewards/rejected": 1.0073689222335815, "step": 4561 }, { "epoch": 2.46, "learning_rate": 3.4068110294299976e-08, "logits/chosen": -1.9782934188842773, "logits/rejected": -1.99332594871521, "logps/chosen": -1.210558533668518, "logps/rejected": -7.594789028167725, "loss": 0.3696, "rewards/accuracies": 1.0, "rewards/chosen": 1.3407419919967651, "rewards/margins": 0.8047398924827576, "rewards/rejected": 0.5360020995140076, "step": 4562 }, { "epoch": 2.46, "learning_rate": 3.404741223152086e-08, "logits/chosen": -2.092622756958008, "logits/rejected": -2.0784225463867188, "logps/chosen": -0.6971478462219238, "logps/rejected": -6.709613800048828, "loss": 0.3908, "rewards/accuracies": 1.0, "rewards/chosen": 1.0879433155059814, "rewards/margins": 0.7377442121505737, "rewards/rejected": 0.3501991331577301, "step": 4563 }, { "epoch": 2.46, "learning_rate": 3.402671721180345e-08, "logits/chosen": -2.239591121673584, "logits/rejected": -2.063206911087036, "logps/chosen": -45.92510223388672, "logps/rejected": -10.721746444702148, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": 2.8138558864593506, "rewards/margins": 1.8034589290618896, "rewards/rejected": 1.010396957397461, "step": 4564 }, { "epoch": 2.46, "learning_rate": 3.400602523909548e-08, "logits/chosen": -2.21488094329834, "logits/rejected": -2.246852397918701, "logps/chosen": -5.440607070922852, "logps/rejected": -23.035850524902344, "loss": 0.7942, "rewards/accuracies": 0.0, "rewards/chosen": 1.095461130142212, "rewards/margins": -0.19282281398773193, "rewards/rejected": 1.2882839441299438, "step": 4565 }, { "epoch": 2.46, "learning_rate": 3.398533631734407e-08, "logits/chosen": -2.0311264991760254, "logits/rejected": -2.2763655185699463, "logps/chosen": -0.19018027186393738, "logps/rejected": -0.2327100932598114, "loss": 0.7042, "rewards/accuracies": 0.0, "rewards/chosen": 0.8917016983032227, "rewards/margins": -0.021978676319122314, "rewards/rejected": 0.913680374622345, "step": 4566 }, { "epoch": 2.46, "learning_rate": 3.3964650450495764e-08, "logits/chosen": -2.0673911571502686, "logits/rejected": -2.3018064498901367, "logps/chosen": -0.8554239273071289, "logps/rejected": -0.8845348358154297, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9550638198852539, "rewards/margins": 0.017055869102478027, "rewards/rejected": 0.9380079507827759, "step": 4567 }, { "epoch": 2.46, "learning_rate": 3.3943967642496535e-08, "logits/chosen": -2.015498399734497, "logits/rejected": -2.3150131702423096, "logps/chosen": -0.2943517863750458, "logps/rejected": -0.2846277058124542, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.9922721982002258, "rewards/margins": 0.024656951427459717, "rewards/rejected": 0.9676152467727661, "step": 4568 }, { "epoch": 2.46, "learning_rate": 3.392328789729176e-08, "logits/chosen": -2.140638589859009, "logits/rejected": -2.315626382827759, "logps/chosen": -3.368969440460205, "logps/rejected": -3.700730800628662, "loss": 0.8308, "rewards/accuracies": 0.0, "rewards/chosen": 0.8790329098701477, "rewards/margins": -0.2586624026298523, "rewards/rejected": 1.1376953125, "step": 4569 }, { "epoch": 2.46, "learning_rate": 3.390261121882625e-08, "logits/chosen": -2.0600500106811523, "logits/rejected": -2.280731678009033, "logps/chosen": -0.3581632673740387, "logps/rejected": -0.4384060502052307, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.91473788022995, "rewards/margins": -0.005522191524505615, "rewards/rejected": 0.9202600717544556, "step": 4570 }, { "epoch": 2.47, "learning_rate": 3.388193761104421e-08, "logits/chosen": -2.0327885150909424, "logits/rejected": -2.3361241817474365, "logps/chosen": -1.3530714511871338, "logps/rejected": -4.293238639831543, "loss": 0.584, "rewards/accuracies": 1.0, "rewards/chosen": 0.9380208849906921, "rewards/margins": 0.2317260503768921, "rewards/rejected": 0.7062948346138, "step": 4571 }, { "epoch": 2.47, "learning_rate": 3.3861267077889256e-08, "logits/chosen": -2.1692049503326416, "logits/rejected": -2.1342549324035645, "logps/chosen": -4.575470447540283, "logps/rejected": -9.197463035583496, "loss": 0.2495, "rewards/accuracies": 1.0, "rewards/chosen": 1.3365951776504517, "rewards/margins": 1.2610243558883667, "rewards/rejected": 0.07557087391614914, "step": 4572 }, { "epoch": 2.47, "learning_rate": 3.384059962330445e-08, "logits/chosen": -2.0466091632843018, "logits/rejected": -2.3366096019744873, "logps/chosen": -0.4569040536880493, "logps/rejected": -0.48067834973335266, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.845273494720459, "rewards/margins": 0.015553295612335205, "rewards/rejected": 0.8297201991081238, "step": 4573 }, { "epoch": 2.47, "learning_rate": 3.3819935251232226e-08, "logits/chosen": -2.097158193588257, "logits/rejected": -2.097473621368408, "logps/chosen": -1.7367982864379883, "logps/rejected": -0.5486942529678345, "loss": 0.383, "rewards/accuracies": 1.0, "rewards/chosen": 1.525865077972412, "rewards/margins": 0.7621018886566162, "rewards/rejected": 0.7637631893157959, "step": 4574 }, { "epoch": 2.47, "learning_rate": 3.3799273965614464e-08, "logits/chosen": -2.001875400543213, "logits/rejected": -2.228562355041504, "logps/chosen": -0.3920808434486389, "logps/rejected": -0.3290645480155945, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.805252730846405, "rewards/margins": 0.017579317092895508, "rewards/rejected": 0.7876734137535095, "step": 4575 }, { "epoch": 2.47, "learning_rate": 3.377861577039244e-08, "logits/chosen": -2.080965518951416, "logits/rejected": -2.0753915309906006, "logps/chosen": -7.386578559875488, "logps/rejected": -10.070945739746094, "loss": 0.2431, "rewards/accuracies": 1.0, "rewards/chosen": 1.5959407091140747, "rewards/margins": 1.2903268337249756, "rewards/rejected": 0.3056139051914215, "step": 4576 }, { "epoch": 2.47, "learning_rate": 3.375796066950684e-08, "logits/chosen": -2.084033966064453, "logits/rejected": -2.325566530227661, "logps/chosen": -0.1833615005016327, "logps/rejected": -0.23034344613552094, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.8776288032531738, "rewards/margins": 0.00031870603561401367, "rewards/rejected": 0.8773100972175598, "step": 4577 }, { "epoch": 2.47, "learning_rate": 3.373730866689776e-08, "logits/chosen": -2.0018675327301025, "logits/rejected": -1.9338111877441406, "logps/chosen": -13.773099899291992, "logps/rejected": -1.7457488775253296, "loss": 0.2876, "rewards/accuracies": 1.0, "rewards/chosen": 1.982055902481079, "rewards/margins": 1.0990855693817139, "rewards/rejected": 0.8829702734947205, "step": 4578 }, { "epoch": 2.47, "learning_rate": 3.3716659766504715e-08, "logits/chosen": -2.001979351043701, "logits/rejected": -2.0106728076934814, "logps/chosen": -3.147805690765381, "logps/rejected": -1.5681508779525757, "loss": 0.62, "rewards/accuracies": 1.0, "rewards/chosen": 1.0989850759506226, "rewards/margins": 0.1521233320236206, "rewards/rejected": 0.946861743927002, "step": 4579 }, { "epoch": 2.47, "learning_rate": 3.36960139722666e-08, "logits/chosen": -2.015512704849243, "logits/rejected": -2.2864253520965576, "logps/chosen": -0.11545400321483612, "logps/rejected": -0.1150033175945282, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.9346750378608704, "rewards/margins": 0.004813790321350098, "rewards/rejected": 0.9298612475395203, "step": 4580 }, { "epoch": 2.47, "learning_rate": 3.367537128812174e-08, "logits/chosen": -2.1418676376342773, "logits/rejected": -2.3275303840637207, "logps/chosen": -0.5371959209442139, "logps/rejected": -0.5113557577133179, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.9801980257034302, "rewards/margins": 0.034227967262268066, "rewards/rejected": 0.9459700584411621, "step": 4581 }, { "epoch": 2.47, "learning_rate": 3.3654731718007876e-08, "logits/chosen": -2.1485962867736816, "logits/rejected": -2.149775505065918, "logps/chosen": -0.1768750250339508, "logps/rejected": -7.305181980133057, "loss": 0.3862, "rewards/accuracies": 1.0, "rewards/chosen": 0.9714142084121704, "rewards/margins": 0.7519857287406921, "rewards/rejected": 0.21942849457263947, "step": 4582 }, { "epoch": 2.47, "learning_rate": 3.363409526586212e-08, "logits/chosen": -2.104062557220459, "logits/rejected": -2.3025450706481934, "logps/chosen": -3.4863390922546387, "logps/rejected": -1.6163705587387085, "loss": 0.7457, "rewards/accuracies": 0.0, "rewards/chosen": 0.9215623736381531, "rewards/margins": -0.10254305601119995, "rewards/rejected": 1.024105429649353, "step": 4583 }, { "epoch": 2.47, "learning_rate": 3.361346193562105e-08, "logits/chosen": -1.9983636140823364, "logits/rejected": -2.006868600845337, "logps/chosen": -1.4141173362731934, "logps/rejected": -2.9214067459106445, "loss": 0.4955, "rewards/accuracies": 1.0, "rewards/chosen": 1.0626661777496338, "rewards/margins": 0.4441891312599182, "rewards/rejected": 0.6184770464897156, "step": 4584 }, { "epoch": 2.47, "learning_rate": 3.359283173122059e-08, "logits/chosen": -2.073946952819824, "logits/rejected": -2.268045663833618, "logps/chosen": -0.34358420968055725, "logps/rejected": -0.3209047317504883, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.8428744673728943, "rewards/margins": 0.018328607082366943, "rewards/rejected": 0.8245458602905273, "step": 4585 }, { "epoch": 2.47, "learning_rate": 3.357220465659609e-08, "logits/chosen": -2.059964656829834, "logits/rejected": -2.3077187538146973, "logps/chosen": -0.12471785396337509, "logps/rejected": -0.10878859460353851, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.7352965474128723, "rewards/margins": 0.015384793281555176, "rewards/rejected": 0.7199117541313171, "step": 4586 }, { "epoch": 2.47, "learning_rate": 3.3551580715682284e-08, "logits/chosen": -2.0341968536376953, "logits/rejected": -2.25661563873291, "logps/chosen": -0.13744322955608368, "logps/rejected": -0.17102523148059845, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.995635986328125, "rewards/margins": 0.0012064576148986816, "rewards/rejected": 0.9944295287132263, "step": 4587 }, { "epoch": 2.47, "learning_rate": 3.353095991241335e-08, "logits/chosen": -2.020866632461548, "logits/rejected": -2.225449562072754, "logps/chosen": -0.678252100944519, "logps/rejected": -0.6231083273887634, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.9520365595817566, "rewards/margins": 0.009822726249694824, "rewards/rejected": 0.9422138333320618, "step": 4588 }, { "epoch": 2.48, "learning_rate": 3.3510342250722834e-08, "logits/chosen": -2.105546236038208, "logits/rejected": -1.9890214204788208, "logps/chosen": -24.1090145111084, "logps/rejected": -4.366123199462891, "loss": 0.4014, "rewards/accuracies": 1.0, "rewards/chosen": 1.6888856887817383, "rewards/margins": 0.7053305506706238, "rewards/rejected": 0.9835551381111145, "step": 4589 }, { "epoch": 2.48, "learning_rate": 3.348972773454368e-08, "logits/chosen": -2.168584108352661, "logits/rejected": -2.332026243209839, "logps/chosen": -0.5392385721206665, "logps/rejected": -2.838623046875, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 1.0518732070922852, "rewards/margins": 0.0217512845993042, "rewards/rejected": 1.030121922492981, "step": 4590 }, { "epoch": 2.48, "learning_rate": 3.3469116367808266e-08, "logits/chosen": -2.145113945007324, "logits/rejected": -2.1454432010650635, "logps/chosen": -2.0196876525878906, "logps/rejected": -2.373758316040039, "loss": 0.6047, "rewards/accuracies": 1.0, "rewards/chosen": 0.9015556573867798, "rewards/margins": 0.18545860052108765, "rewards/rejected": 0.7160970568656921, "step": 4591 }, { "epoch": 2.48, "learning_rate": 3.344850815444833e-08, "logits/chosen": -2.15330171585083, "logits/rejected": -2.2825655937194824, "logps/chosen": -2.1456518173217773, "logps/rejected": -2.298574209213257, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.7098190188407898, "rewards/margins": 0.024633288383483887, "rewards/rejected": 0.6851857304573059, "step": 4592 }, { "epoch": 2.48, "learning_rate": 3.3427903098395025e-08, "logits/chosen": -2.12813663482666, "logits/rejected": -2.102935314178467, "logps/chosen": -15.792165756225586, "logps/rejected": -9.759718894958496, "loss": 0.1623, "rewards/accuracies": 1.0, "rewards/chosen": 1.8227208852767944, "rewards/margins": 1.7360049486160278, "rewards/rejected": 0.08671589195728302, "step": 4593 }, { "epoch": 2.48, "learning_rate": 3.34073012035789e-08, "logits/chosen": -2.143995761871338, "logits/rejected": -2.136838436126709, "logps/chosen": -0.7670472860336304, "logps/rejected": -10.072698593139648, "loss": 0.3215, "rewards/accuracies": 1.0, "rewards/chosen": 1.1895807981491089, "rewards/margins": 0.9697980880737305, "rewards/rejected": 0.2197827398777008, "step": 4594 }, { "epoch": 2.48, "learning_rate": 3.338670247392992e-08, "logits/chosen": -2.032313346862793, "logits/rejected": -2.032517671585083, "logps/chosen": -0.2086617350578308, "logps/rejected": -5.51338529586792, "loss": 0.424, "rewards/accuracies": 1.0, "rewards/chosen": 0.9482671618461609, "rewards/margins": 0.6384745836257935, "rewards/rejected": 0.30979257822036743, "step": 4595 }, { "epoch": 2.48, "learning_rate": 3.3366106913377415e-08, "logits/chosen": -2.046116590499878, "logits/rejected": -2.347440004348755, "logps/chosen": -2.1773500442504883, "logps/rejected": -1.8410766124725342, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.9349652528762817, "rewards/margins": 0.008407413959503174, "rewards/rejected": 0.9265578389167786, "step": 4596 }, { "epoch": 2.48, "learning_rate": 3.334551452585012e-08, "logits/chosen": -2.1889455318450928, "logits/rejected": -2.333235740661621, "logps/chosen": -9.839290618896484, "logps/rejected": -12.708392143249512, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 1.157111406326294, "rewards/margins": 0.021592259407043457, "rewards/rejected": 1.1355191469192505, "step": 4597 }, { "epoch": 2.48, "learning_rate": 3.3324925315276186e-08, "logits/chosen": -2.051628828048706, "logits/rejected": -2.2738492488861084, "logps/chosen": -2.174792528152466, "logps/rejected": -2.138038396835327, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.8751100897789001, "rewards/margins": -0.0009375214576721191, "rewards/rejected": 0.8760476112365723, "step": 4598 }, { "epoch": 2.48, "learning_rate": 3.3304339285583124e-08, "logits/chosen": -2.0446617603302, "logits/rejected": -2.0438008308410645, "logps/chosen": -3.172806978225708, "logps/rejected": -3.0566747188568115, "loss": 0.4299, "rewards/accuracies": 1.0, "rewards/chosen": 1.4065874814987183, "rewards/margins": 0.6214255094528198, "rewards/rejected": 0.7851619720458984, "step": 4599 }, { "epoch": 2.48, "learning_rate": 3.328375644069787e-08, "logits/chosen": -2.0363030433654785, "logits/rejected": -2.0286190509796143, "logps/chosen": -2.1145999431610107, "logps/rejected": -6.047695636749268, "loss": 0.3195, "rewards/accuracies": 1.0, "rewards/chosen": 1.317060947418213, "rewards/margins": 0.976955771446228, "rewards/rejected": 0.34010520577430725, "step": 4600 }, { "epoch": 2.48, "learning_rate": 3.326317678454673e-08, "logits/chosen": -2.177901268005371, "logits/rejected": -2.173476219177246, "logps/chosen": -7.422040939331055, "logps/rejected": -6.742015838623047, "loss": 0.3327, "rewards/accuracies": 1.0, "rewards/chosen": 1.226051926612854, "rewards/margins": 0.9296806454658508, "rewards/rejected": 0.2963712811470032, "step": 4601 }, { "epoch": 2.48, "learning_rate": 3.3242600321055424e-08, "logits/chosen": -2.2517647743225098, "logits/rejected": -2.4360687732696533, "logps/chosen": -10.122191429138184, "logps/rejected": -26.13759994506836, "loss": 0.6403, "rewards/accuracies": 1.0, "rewards/chosen": 1.1550530195236206, "rewards/margins": 0.10874903202056885, "rewards/rejected": 1.0463039875030518, "step": 4602 }, { "epoch": 2.48, "learning_rate": 3.3222027054149026e-08, "logits/chosen": -2.2157254219055176, "logits/rejected": -2.2112390995025635, "logps/chosen": -2.7697205543518066, "logps/rejected": -7.008369445800781, "loss": 0.3973, "rewards/accuracies": 1.0, "rewards/chosen": 1.0299561023712158, "rewards/margins": 0.7177844047546387, "rewards/rejected": 0.31217166781425476, "step": 4603 }, { "epoch": 2.48, "learning_rate": 3.3201456987752033e-08, "logits/chosen": -2.013262987136841, "logits/rejected": -2.288524627685547, "logps/chosen": -0.9555507898330688, "logps/rejected": -0.9240780472755432, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 1.0260131359100342, "rewards/margins": 0.011648774147033691, "rewards/rejected": 1.0143643617630005, "step": 4604 }, { "epoch": 2.48, "learning_rate": 3.3180890125788306e-08, "logits/chosen": -2.079166889190674, "logits/rejected": -2.2729740142822266, "logps/chosen": -1.2191492319107056, "logps/rejected": -1.291693091392517, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.887043297290802, "rewards/margins": 0.035153210163116455, "rewards/rejected": 0.8518900871276855, "step": 4605 }, { "epoch": 2.48, "learning_rate": 3.316032647218112e-08, "logits/chosen": -2.1003236770629883, "logits/rejected": -2.064324378967285, "logps/chosen": -10.591928482055664, "logps/rejected": -3.7502951622009277, "loss": 0.3062, "rewards/accuracies": 1.0, "rewards/chosen": 1.7223867177963257, "rewards/margins": 1.0265052318572998, "rewards/rejected": 0.6958815455436707, "step": 4606 }, { "epoch": 2.48, "learning_rate": 3.3139766030853144e-08, "logits/chosen": -2.195289134979248, "logits/rejected": -2.284797191619873, "logps/chosen": -0.4770556092262268, "logps/rejected": -0.46910467743873596, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 1.0720149278640747, "rewards/margins": 0.02027297019958496, "rewards/rejected": 1.0517419576644897, "step": 4607 }, { "epoch": 2.49, "learning_rate": 3.311920880572641e-08, "logits/chosen": -2.1063082218170166, "logits/rejected": -2.006065845489502, "logps/chosen": -4.587860584259033, "logps/rejected": -5.050236225128174, "loss": 0.3244, "rewards/accuracies": 1.0, "rewards/chosen": 1.7307382822036743, "rewards/margins": 0.9592506885528564, "rewards/rejected": 0.7714875936508179, "step": 4608 }, { "epoch": 2.49, "learning_rate": 3.309865480072236e-08, "logits/chosen": -2.064302444458008, "logits/rejected": -2.0593738555908203, "logps/chosen": -2.8834385871887207, "logps/rejected": -6.903263568878174, "loss": 0.2195, "rewards/accuracies": 1.0, "rewards/chosen": 1.5687487125396729, "rewards/margins": 1.4044378995895386, "rewards/rejected": 0.16431079804897308, "step": 4609 }, { "epoch": 2.49, "learning_rate": 3.307810401976177e-08, "logits/chosen": -2.0638928413391113, "logits/rejected": -2.0551187992095947, "logps/chosen": -2.748765468597412, "logps/rejected": -6.7917609214782715, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 1.3482122421264648, "rewards/margins": 1.0428683757781982, "rewards/rejected": 0.3053438663482666, "step": 4610 }, { "epoch": 2.49, "learning_rate": 3.305755646676485e-08, "logits/chosen": -2.1302340030670166, "logits/rejected": -2.2985191345214844, "logps/chosen": -1.0561788082122803, "logps/rejected": -1.0641013383865356, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.9574992060661316, "rewards/margins": 0.01740097999572754, "rewards/rejected": 0.940098226070404, "step": 4611 }, { "epoch": 2.49, "learning_rate": 3.30370121456512e-08, "logits/chosen": -2.0552213191986084, "logits/rejected": -2.2472071647644043, "logps/chosen": -2.4803574085235596, "logps/rejected": -2.501140832901001, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": 0.6921046376228333, "rewards/margins": 0.046036601066589355, "rewards/rejected": 0.6460680365562439, "step": 4612 }, { "epoch": 2.49, "learning_rate": 3.3016471060339766e-08, "logits/chosen": -2.1376261711120605, "logits/rejected": -2.1341745853424072, "logps/chosen": -5.67623233795166, "logps/rejected": -2.898991107940674, "loss": 0.3517, "rewards/accuracies": 1.0, "rewards/chosen": 1.5235651731491089, "rewards/margins": 0.8639646768569946, "rewards/rejected": 0.6596004962921143, "step": 4613 }, { "epoch": 2.49, "learning_rate": 3.2995933214748907e-08, "logits/chosen": -2.0048391819000244, "logits/rejected": -2.010237455368042, "logps/chosen": -1.608484148979187, "logps/rejected": -3.8390934467315674, "loss": 0.4118, "rewards/accuracies": 1.0, "rewards/chosen": 1.1955604553222656, "rewards/margins": 0.6742967367172241, "rewards/rejected": 0.5212637186050415, "step": 4614 }, { "epoch": 2.49, "learning_rate": 3.297539861279634e-08, "logits/chosen": -2.0304298400878906, "logits/rejected": -2.0290091037750244, "logps/chosen": -1.7642533779144287, "logps/rejected": -5.099207878112793, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 1.529706358909607, "rewards/margins": 0.9208444952964783, "rewards/rejected": 0.6088618636131287, "step": 4615 }, { "epoch": 2.49, "learning_rate": 3.2954867258399186e-08, "logits/chosen": -2.1289119720458984, "logits/rejected": -2.2851321697235107, "logps/chosen": -0.6161984801292419, "logps/rejected": -0.7763345837593079, "loss": 0.6709, "rewards/accuracies": 1.0, "rewards/chosen": 0.6942976713180542, "rewards/margins": 0.044993460178375244, "rewards/rejected": 0.649304211139679, "step": 4616 }, { "epoch": 2.49, "learning_rate": 3.293433915547393e-08, "logits/chosen": -2.0784215927124023, "logits/rejected": -2.0798254013061523, "logps/chosen": -0.30609792470932007, "logps/rejected": -5.473111152648926, "loss": 0.4261, "rewards/accuracies": 1.0, "rewards/chosen": 1.0737600326538086, "rewards/margins": 0.6324158906936646, "rewards/rejected": 0.44134417176246643, "step": 4617 }, { "epoch": 2.49, "learning_rate": 3.2913814307936465e-08, "logits/chosen": -2.1177101135253906, "logits/rejected": -2.2311716079711914, "logps/chosen": -17.035934448242188, "logps/rejected": -21.161842346191406, "loss": 0.283, "rewards/accuracies": 1.0, "rewards/chosen": 1.841050386428833, "rewards/margins": 1.1176477670669556, "rewards/rejected": 0.7234026193618774, "step": 4618 }, { "epoch": 2.49, "learning_rate": 3.289329271970202e-08, "logits/chosen": -2.190383195877075, "logits/rejected": -2.384422540664673, "logps/chosen": -0.46489375829696655, "logps/rejected": -0.4792467951774597, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 1.0852487087249756, "rewards/margins": 0.007380843162536621, "rewards/rejected": 1.077867865562439, "step": 4619 }, { "epoch": 2.49, "learning_rate": 3.287277439468523e-08, "logits/chosen": -2.2762932777404785, "logits/rejected": -2.218108654022217, "logps/chosen": -10.527435302734375, "logps/rejected": -9.635509490966797, "loss": 0.1887, "rewards/accuracies": 1.0, "rewards/chosen": 2.0899200439453125, "rewards/margins": 1.5715720653533936, "rewards/rejected": 0.5183479189872742, "step": 4620 }, { "epoch": 2.49, "learning_rate": 3.28522593368001e-08, "logits/chosen": -2.148023843765259, "logits/rejected": -2.3461101055145264, "logps/chosen": -1.9366874694824219, "logps/rejected": -1.7675305604934692, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.8640827536582947, "rewards/margins": -0.0016610026359558105, "rewards/rejected": 0.8657437562942505, "step": 4621 }, { "epoch": 2.49, "learning_rate": 3.283174754996002e-08, "logits/chosen": -2.10744309425354, "logits/rejected": -2.068148612976074, "logps/chosen": -5.455686092376709, "logps/rejected": -3.181232213973999, "loss": 0.3225, "rewards/accuracies": 1.0, "rewards/chosen": 1.5799015760421753, "rewards/margins": 0.9660711884498596, "rewards/rejected": 0.6138303875923157, "step": 4622 }, { "epoch": 2.49, "learning_rate": 3.281123903807774e-08, "logits/chosen": -2.0261945724487305, "logits/rejected": -2.0263285636901855, "logps/chosen": -1.6876945495605469, "logps/rejected": -5.726889610290527, "loss": 0.2961, "rewards/accuracies": 1.0, "rewards/chosen": 1.5462888479232788, "rewards/margins": 1.065199375152588, "rewards/rejected": 0.48108941316604614, "step": 4623 }, { "epoch": 2.49, "learning_rate": 3.279073380506539e-08, "logits/chosen": -2.082425594329834, "logits/rejected": -2.0806527137756348, "logps/chosen": -4.176357269287109, "logps/rejected": -3.9964687824249268, "loss": 0.5537, "rewards/accuracies": 1.0, "rewards/chosen": 0.8911043405532837, "rewards/margins": 0.3014675974845886, "rewards/rejected": 0.5896367430686951, "step": 4624 }, { "epoch": 2.49, "learning_rate": 3.27702318548345e-08, "logits/chosen": -2.137146472930908, "logits/rejected": -2.1286768913269043, "logps/chosen": -12.095711708068848, "logps/rejected": -1.1179165840148926, "loss": 0.4064, "rewards/accuracies": 1.0, "rewards/chosen": 1.4370936155319214, "rewards/margins": 0.6904283761978149, "rewards/rejected": 0.7466652393341064, "step": 4625 }, { "epoch": 2.5, "learning_rate": 3.274973319129594e-08, "logits/chosen": -2.139634609222412, "logits/rejected": -2.1397321224212646, "logps/chosen": -0.6666244268417358, "logps/rejected": -2.355360984802246, "loss": 0.4817, "rewards/accuracies": 1.0, "rewards/chosen": 1.2137972116470337, "rewards/margins": 0.4800402522087097, "rewards/rejected": 0.733756959438324, "step": 4626 }, { "epoch": 2.5, "learning_rate": 3.2729237818359956e-08, "logits/chosen": -2.098297357559204, "logits/rejected": -2.095198631286621, "logps/chosen": -2.9134390354156494, "logps/rejected": -3.399967908859253, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": 1.1849182844161987, "rewards/margins": 0.5114263892173767, "rewards/rejected": 0.673491895198822, "step": 4627 }, { "epoch": 2.5, "learning_rate": 3.270874573993617e-08, "logits/chosen": -2.159599542617798, "logits/rejected": -2.370368719100952, "logps/chosen": -0.31757670640945435, "logps/rejected": -15.377969741821289, "loss": 0.7387, "rewards/accuracies": 0.0, "rewards/chosen": 0.8514439463615417, "rewards/margins": -0.08903837203979492, "rewards/rejected": 0.9404823184013367, "step": 4628 }, { "epoch": 2.5, "learning_rate": 3.2688256959933615e-08, "logits/chosen": -1.9718292951583862, "logits/rejected": -1.972098469734192, "logps/chosen": -6.315781593322754, "logps/rejected": -3.8558647632598877, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": 1.5460561513900757, "rewards/margins": 0.9958665370941162, "rewards/rejected": 0.5501896142959595, "step": 4629 }, { "epoch": 2.5, "learning_rate": 3.2667771482260645e-08, "logits/chosen": -2.0663373470306396, "logits/rejected": -2.079287528991699, "logps/chosen": -4.109706401824951, "logps/rejected": -10.366329193115234, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 1.7395352125167847, "rewards/margins": 1.2046682834625244, "rewards/rejected": 0.5348669290542603, "step": 4630 }, { "epoch": 2.5, "learning_rate": 3.264728931082499e-08, "logits/chosen": -2.039050817489624, "logits/rejected": -2.2670931816101074, "logps/chosen": -3.9727344512939453, "logps/rejected": -3.508629560470581, "loss": 0.7055, "rewards/accuracies": 0.0, "rewards/chosen": 0.6358882784843445, "rewards/margins": -0.02465355396270752, "rewards/rejected": 0.660541832447052, "step": 4631 }, { "epoch": 2.5, "learning_rate": 3.262681044953378e-08, "logits/chosen": -2.035757064819336, "logits/rejected": -2.3202061653137207, "logps/chosen": -0.1957322508096695, "logps/rejected": -0.2248261272907257, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": 0.7639858722686768, "rewards/margins": -0.006725609302520752, "rewards/rejected": 0.7707114815711975, "step": 4632 }, { "epoch": 2.5, "learning_rate": 3.260633490229348e-08, "logits/chosen": -2.0623483657836914, "logits/rejected": -2.2780439853668213, "logps/chosen": -1.1105314493179321, "logps/rejected": -1.122025489807129, "loss": 0.7025, "rewards/accuracies": 0.0, "rewards/chosen": 0.9995543360710144, "rewards/margins": -0.018674910068511963, "rewards/rejected": 1.0182292461395264, "step": 4633 }, { "epoch": 2.5, "learning_rate": 3.2585862673009936e-08, "logits/chosen": -2.1384377479553223, "logits/rejected": -2.1379857063293457, "logps/chosen": -0.12253248691558838, "logps/rejected": -5.992271423339844, "loss": 0.4052, "rewards/accuracies": 1.0, "rewards/chosen": 1.0145063400268555, "rewards/margins": 0.6940463781356812, "rewards/rejected": 0.32045993208885193, "step": 4634 }, { "epoch": 2.5, "learning_rate": 3.256539376558836e-08, "logits/chosen": -2.006439208984375, "logits/rejected": -2.2895872592926025, "logps/chosen": -0.7649654150009155, "logps/rejected": -0.6372827291488647, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 1.052561640739441, "rewards/margins": 0.006517887115478516, "rewards/rejected": 1.0460437536239624, "step": 4635 }, { "epoch": 2.5, "learning_rate": 3.254492818393333e-08, "logits/chosen": -2.0985493659973145, "logits/rejected": -2.0994274616241455, "logps/chosen": -0.31809473037719727, "logps/rejected": -5.034080505371094, "loss": 0.414, "rewards/accuracies": 1.0, "rewards/chosen": 1.025486946105957, "rewards/margins": 0.6677720546722412, "rewards/rejected": 0.35771486163139343, "step": 4636 }, { "epoch": 2.5, "learning_rate": 3.25244659319488e-08, "logits/chosen": -1.9542498588562012, "logits/rejected": -2.2788138389587402, "logps/chosen": -0.942737340927124, "logps/rejected": -0.9656818509101868, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.8439252972602844, "rewards/margins": 0.022733867168426514, "rewards/rejected": 0.8211914300918579, "step": 4637 }, { "epoch": 2.5, "learning_rate": 3.2504007013538085e-08, "logits/chosen": -2.0418503284454346, "logits/rejected": -2.2864575386047363, "logps/chosen": -3.218268871307373, "logps/rejected": -2.9734699726104736, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.4890922009944916, "rewards/margins": 0.004987478256225586, "rewards/rejected": 0.484104722738266, "step": 4638 }, { "epoch": 2.5, "learning_rate": 3.248355143260383e-08, "logits/chosen": -2.1301684379577637, "logits/rejected": -2.12849497795105, "logps/chosen": -2.2441470623016357, "logps/rejected": -1.7071774005889893, "loss": 0.5102, "rewards/accuracies": 1.0, "rewards/chosen": 1.3785332441329956, "rewards/margins": 0.4070209264755249, "rewards/rejected": 0.9715123176574707, "step": 4639 }, { "epoch": 2.5, "learning_rate": 3.2463099193048106e-08, "logits/chosen": -2.0949573516845703, "logits/rejected": -2.0971434116363525, "logps/chosen": -1.9275974035263062, "logps/rejected": -6.147494792938232, "loss": 0.3583, "rewards/accuracies": 1.0, "rewards/chosen": 1.146917462348938, "rewards/margins": 0.8417624235153198, "rewards/rejected": 0.30515503883361816, "step": 4640 }, { "epoch": 2.5, "learning_rate": 3.244265029877229e-08, "logits/chosen": -2.0156965255737305, "logits/rejected": -2.22989821434021, "logps/chosen": -0.31319233775138855, "logps/rejected": -0.26655033230781555, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.9264995455741882, "rewards/margins": -0.0026317834854125977, "rewards/rejected": 0.9291313290596008, "step": 4641 }, { "epoch": 2.5, "learning_rate": 3.242220475367717e-08, "logits/chosen": -2.1255218982696533, "logits/rejected": -2.286342144012451, "logps/chosen": -1.267235279083252, "logps/rejected": -1.341261625289917, "loss": 0.7003, "rewards/accuracies": 0.0, "rewards/chosen": 0.941209614276886, "rewards/margins": -0.014254450798034668, "rewards/rejected": 0.9554640650749207, "step": 4642 }, { "epoch": 2.5, "learning_rate": 3.240176256166284e-08, "logits/chosen": -2.070495367050171, "logits/rejected": -2.0747056007385254, "logps/chosen": -3.7863497734069824, "logps/rejected": -14.374035835266113, "loss": 0.3573, "rewards/accuracies": 1.0, "rewards/chosen": 0.9833887219429016, "rewards/margins": 0.8451391458511353, "rewards/rejected": 0.13824959099292755, "step": 4643 }, { "epoch": 2.5, "learning_rate": 3.2381323726628806e-08, "logits/chosen": -2.158334970474243, "logits/rejected": -2.140554428100586, "logps/chosen": -11.663993835449219, "logps/rejected": -9.345276832580566, "loss": 0.2301, "rewards/accuracies": 1.0, "rewards/chosen": 1.409825325012207, "rewards/margins": 1.35213303565979, "rewards/rejected": 0.05769224092364311, "step": 4644 }, { "epoch": 2.51, "learning_rate": 3.236088825247389e-08, "logits/chosen": -2.033698320388794, "logits/rejected": -2.2725203037261963, "logps/chosen": -1.4739668369293213, "logps/rejected": -4.30488920211792, "loss": 0.612, "rewards/accuracies": 1.0, "rewards/chosen": 0.9518632888793945, "rewards/margins": 0.16940146684646606, "rewards/rejected": 0.7824618220329285, "step": 4645 }, { "epoch": 2.51, "learning_rate": 3.234045614309632e-08, "logits/chosen": -2.127190351486206, "logits/rejected": -2.1520116329193115, "logps/chosen": -6.755316257476807, "logps/rejected": -1.523389220237732, "loss": 0.3868, "rewards/accuracies": 1.0, "rewards/chosen": 1.6788017749786377, "rewards/margins": 0.7501380443572998, "rewards/rejected": 0.9286637306213379, "step": 4646 }, { "epoch": 2.51, "learning_rate": 3.232002740239363e-08, "logits/chosen": -1.961897373199463, "logits/rejected": -1.957028865814209, "logps/chosen": -4.397825241088867, "logps/rejected": -5.647474765777588, "loss": 0.3011, "rewards/accuracies": 1.0, "rewards/chosen": 1.4572985172271729, "rewards/margins": 1.0461684465408325, "rewards/rejected": 0.4111301004886627, "step": 4647 }, { "epoch": 2.51, "learning_rate": 3.229960203426275e-08, "logits/chosen": -2.0614941120147705, "logits/rejected": -2.0472333431243896, "logps/chosen": -13.4349946975708, "logps/rejected": -4.566973686218262, "loss": 0.5204, "rewards/accuracies": 1.0, "rewards/chosen": 1.4386647939682007, "rewards/margins": 0.3816337585449219, "rewards/rejected": 1.0570310354232788, "step": 4648 }, { "epoch": 2.51, "learning_rate": 3.2279180042599966e-08, "logits/chosen": -2.0855445861816406, "logits/rejected": -2.060234785079956, "logps/chosen": -5.53366756439209, "logps/rejected": -3.882923126220703, "loss": 0.3228, "rewards/accuracies": 1.0, "rewards/chosen": 1.5616776943206787, "rewards/margins": 0.9650833606719971, "rewards/rejected": 0.5965943336486816, "step": 4649 }, { "epoch": 2.51, "learning_rate": 3.225876143130086e-08, "logits/chosen": -2.0465588569641113, "logits/rejected": -2.2970926761627197, "logps/chosen": -4.024991989135742, "logps/rejected": -3.867615222930908, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.576964795589447, "rewards/margins": 0.012394964694976807, "rewards/rejected": 0.5645698308944702, "step": 4650 }, { "epoch": 2.51, "learning_rate": 3.223834620426048e-08, "logits/chosen": -1.9793672561645508, "logits/rejected": -2.2229645252227783, "logps/chosen": -0.3685765862464905, "logps/rejected": -0.37311577796936035, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.9628031849861145, "rewards/margins": 0.008763432502746582, "rewards/rejected": 0.9540397524833679, "step": 4651 }, { "epoch": 2.51, "learning_rate": 3.221793436537313e-08, "logits/chosen": -2.0017216205596924, "logits/rejected": -2.020120143890381, "logps/chosen": -1.930480718612671, "logps/rejected": -5.188549995422363, "loss": 0.4188, "rewards/accuracies": 1.0, "rewards/chosen": 1.2390902042388916, "rewards/margins": 0.6536975502967834, "rewards/rejected": 0.5853926539421082, "step": 4652 }, { "epoch": 2.51, "learning_rate": 3.219752591853252e-08, "logits/chosen": -2.058712959289551, "logits/rejected": -2.283729314804077, "logps/chosen": -0.93104088306427, "logps/rejected": -2.1047708988189697, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 1.0503379106521606, "rewards/margins": -0.006809234619140625, "rewards/rejected": 1.0571471452713013, "step": 4653 }, { "epoch": 2.51, "learning_rate": 3.217712086763168e-08, "logits/chosen": -2.05161190032959, "logits/rejected": -2.256119966506958, "logps/chosen": -9.160792350769043, "logps/rejected": -8.286491394042969, "loss": 0.7426, "rewards/accuracies": 0.0, "rewards/chosen": 0.7757481932640076, "rewards/margins": -0.0966460108757019, "rewards/rejected": 0.8723942041397095, "step": 4654 }, { "epoch": 2.51, "learning_rate": 3.215671921656303e-08, "logits/chosen": -1.9701248407363892, "logits/rejected": -2.2788989543914795, "logps/chosen": -0.9763145446777344, "logps/rejected": -0.8791952133178711, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.9519028663635254, "rewards/margins": 0.003711879253387451, "rewards/rejected": 0.9481909871101379, "step": 4655 }, { "epoch": 2.51, "learning_rate": 3.21363209692183e-08, "logits/chosen": -2.106851577758789, "logits/rejected": -2.1091690063476562, "logps/chosen": -5.594420433044434, "logps/rejected": -10.02022647857666, "loss": 0.2392, "rewards/accuracies": 1.0, "rewards/chosen": 1.626063585281372, "rewards/margins": 1.3082435131072998, "rewards/rejected": 0.31782007217407227, "step": 4656 }, { "epoch": 2.51, "learning_rate": 3.211592612948859e-08, "logits/chosen": -2.0861363410949707, "logits/rejected": -2.0843827724456787, "logps/chosen": -0.5226503610610962, "logps/rejected": -5.678267478942871, "loss": 0.4075, "rewards/accuracies": 1.0, "rewards/chosen": 0.9329947829246521, "rewards/margins": 0.6870661973953247, "rewards/rejected": 0.2459285706281662, "step": 4657 }, { "epoch": 2.51, "learning_rate": 3.209553470126436e-08, "logits/chosen": -2.0666232109069824, "logits/rejected": -2.2615416049957275, "logps/chosen": -4.362997531890869, "logps/rejected": -0.8303827047348022, "loss": 0.845, "rewards/accuracies": 0.0, "rewards/chosen": 0.8108366131782532, "rewards/margins": -0.28369468450546265, "rewards/rejected": 1.0945312976837158, "step": 4658 }, { "epoch": 2.51, "learning_rate": 3.2075146688435406e-08, "logits/chosen": -1.9482728242874146, "logits/rejected": -1.9571582078933716, "logps/chosen": -1.2489432096481323, "logps/rejected": -4.275519847869873, "loss": 0.4469, "rewards/accuracies": 1.0, "rewards/chosen": 0.9555906653404236, "rewards/margins": 0.5735492706298828, "rewards/rejected": 0.38204142451286316, "step": 4659 }, { "epoch": 2.51, "learning_rate": 3.2054762094890886e-08, "logits/chosen": -2.150420904159546, "logits/rejected": -2.1320345401763916, "logps/chosen": -6.6233696937561035, "logps/rejected": -5.5549445152282715, "loss": 0.3543, "rewards/accuracies": 1.0, "rewards/chosen": 1.3821970224380493, "rewards/margins": 0.8552396297454834, "rewards/rejected": 0.5269573926925659, "step": 4660 }, { "epoch": 2.51, "learning_rate": 3.203438092451928e-08, "logits/chosen": -1.9659901857376099, "logits/rejected": -2.2219767570495605, "logps/chosen": -0.7378581166267395, "logps/rejected": -0.8499839305877686, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 0.8624513745307922, "rewards/margins": 0.03697633743286133, "rewards/rejected": 0.8254750370979309, "step": 4661 }, { "epoch": 2.51, "learning_rate": 3.201400318120843e-08, "logits/chosen": -2.222552537918091, "logits/rejected": -2.328946828842163, "logps/chosen": -16.306724548339844, "logps/rejected": -7.802353858947754, "loss": 0.5896, "rewards/accuracies": 1.0, "rewards/chosen": 1.235542893409729, "rewards/margins": 0.2189946174621582, "rewards/rejected": 1.0165482759475708, "step": 4662 }, { "epoch": 2.52, "learning_rate": 3.1993628868845544e-08, "logits/chosen": -2.0910749435424805, "logits/rejected": -2.347452402114868, "logps/chosen": -2.8145387172698975, "logps/rejected": -2.7897531986236572, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 1.0045092105865479, "rewards/margins": 0.006726324558258057, "rewards/rejected": 0.9977828860282898, "step": 4663 }, { "epoch": 2.52, "learning_rate": 3.197325799131715e-08, "logits/chosen": -2.0623886585235596, "logits/rejected": -2.3102266788482666, "logps/chosen": -1.105375051498413, "logps/rejected": -3.7555699348449707, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 1.0622936487197876, "rewards/margins": 0.02273249626159668, "rewards/rejected": 1.039561152458191, "step": 4664 }, { "epoch": 2.52, "learning_rate": 3.195289055250913e-08, "logits/chosen": -2.080151081085205, "logits/rejected": -2.085848808288574, "logps/chosen": -1.0516271591186523, "logps/rejected": -1.7754467725753784, "loss": 0.489, "rewards/accuracies": 1.0, "rewards/chosen": 1.18975830078125, "rewards/margins": 0.46098649501800537, "rewards/rejected": 0.7287718057632446, "step": 4665 }, { "epoch": 2.52, "learning_rate": 3.1932526556306696e-08, "logits/chosen": -2.0413081645965576, "logits/rejected": -2.0518906116485596, "logps/chosen": -1.4246668815612793, "logps/rejected": -2.6533827781677246, "loss": 0.4268, "rewards/accuracies": 1.0, "rewards/chosen": 1.234084129333496, "rewards/margins": 0.6304935812950134, "rewards/rejected": 0.6035905480384827, "step": 4666 }, { "epoch": 2.52, "learning_rate": 3.1912166006594424e-08, "logits/chosen": -2.0471110343933105, "logits/rejected": -2.05598521232605, "logps/chosen": -2.095437526702881, "logps/rejected": -2.425565719604492, "loss": 0.4475, "rewards/accuracies": 1.0, "rewards/chosen": 1.2578868865966797, "rewards/margins": 0.5719102621078491, "rewards/rejected": 0.6859766244888306, "step": 4667 }, { "epoch": 2.52, "learning_rate": 3.1891808907256226e-08, "logits/chosen": -1.8653122186660767, "logits/rejected": -2.284432888031006, "logps/chosen": -0.45883020758628845, "logps/rejected": -0.6525712013244629, "loss": 0.6631, "rewards/accuracies": 1.0, "rewards/chosen": 0.8166743516921997, "rewards/margins": 0.061053454875946045, "rewards/rejected": 0.7556208968162537, "step": 4668 }, { "epoch": 2.52, "learning_rate": 3.187145526217535e-08, "logits/chosen": -1.951366662979126, "logits/rejected": -2.3089284896850586, "logps/chosen": -0.26995649933815, "logps/rejected": -0.193216010928154, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.8410903215408325, "rewards/margins": 0.01838284730911255, "rewards/rejected": 0.82270747423172, "step": 4669 }, { "epoch": 2.52, "learning_rate": 3.185110507523439e-08, "logits/chosen": -2.222062826156616, "logits/rejected": -2.188051462173462, "logps/chosen": -12.91383171081543, "logps/rejected": -3.0357651710510254, "loss": 0.4689, "rewards/accuracies": 1.0, "rewards/chosen": 1.2516233921051025, "rewards/margins": 0.513809859752655, "rewards/rejected": 0.7378135323524475, "step": 4670 }, { "epoch": 2.52, "learning_rate": 3.183075835031527e-08, "logits/chosen": -1.9655652046203613, "logits/rejected": -2.2629776000976562, "logps/chosen": -1.25869882106781, "logps/rejected": -1.1857270002365112, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.8960893750190735, "rewards/margins": 0.004760563373565674, "rewards/rejected": 0.8913288116455078, "step": 4671 }, { "epoch": 2.52, "learning_rate": 3.181041509129926e-08, "logits/chosen": -2.0167765617370605, "logits/rejected": -2.3286654949188232, "logps/chosen": -0.6870788931846619, "logps/rejected": -0.8137326240539551, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.8754633069038391, "rewards/margins": 0.007240116596221924, "rewards/rejected": 0.8682231903076172, "step": 4672 }, { "epoch": 2.52, "learning_rate": 3.1790075302066987e-08, "logits/chosen": -1.9845119714736938, "logits/rejected": -2.2609729766845703, "logps/chosen": -0.34069588780403137, "logps/rejected": -0.4103164076805115, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0141476392745972, "rewards/margins": 0.0240975022315979, "rewards/rejected": 0.9900501370429993, "step": 4673 }, { "epoch": 2.52, "learning_rate": 3.176973898649839e-08, "logits/chosen": -2.0537521839141846, "logits/rejected": -2.24137806892395, "logps/chosen": -1.3293954133987427, "logps/rejected": -1.0249526500701904, "loss": 0.5812, "rewards/accuracies": 1.0, "rewards/chosen": 1.1365607976913452, "rewards/margins": 0.2380252480506897, "rewards/rejected": 0.8985355496406555, "step": 4674 }, { "epoch": 2.52, "learning_rate": 3.174940614847278e-08, "logits/chosen": -2.0335707664489746, "logits/rejected": -2.310297966003418, "logps/chosen": -1.1712921857833862, "logps/rejected": -1.316495656967163, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.915553867816925, "rewards/margins": 0.009486973285675049, "rewards/rejected": 0.90606689453125, "step": 4675 }, { "epoch": 2.52, "learning_rate": 3.1729076791868746e-08, "logits/chosen": -2.0868303775787354, "logits/rejected": -2.08931040763855, "logps/chosen": -2.3463282585144043, "logps/rejected": -2.7969470024108887, "loss": 0.6055, "rewards/accuracies": 1.0, "rewards/chosen": 1.0319786071777344, "rewards/margins": 0.18371087312698364, "rewards/rejected": 0.8482677340507507, "step": 4676 }, { "epoch": 2.52, "learning_rate": 3.1708750920564264e-08, "logits/chosen": -2.1307284832000732, "logits/rejected": -2.1354901790618896, "logps/chosen": -10.509021759033203, "logps/rejected": -7.7584404945373535, "loss": 0.1814, "rewards/accuracies": 1.0, "rewards/chosen": 2.3497207164764404, "rewards/margins": 1.6152385473251343, "rewards/rejected": 0.7344821691513062, "step": 4677 }, { "epoch": 2.52, "learning_rate": 3.1688428538436626e-08, "logits/chosen": -2.0592234134674072, "logits/rejected": -2.294265031814575, "logps/chosen": -1.1478636264801025, "logps/rejected": -1.6767299175262451, "loss": 0.7019, "rewards/accuracies": 0.0, "rewards/chosen": 0.9002752304077148, "rewards/margins": -0.017440974712371826, "rewards/rejected": 0.9177162051200867, "step": 4678 }, { "epoch": 2.52, "learning_rate": 3.166810964936247e-08, "logits/chosen": -2.0503857135772705, "logits/rejected": -2.0496420860290527, "logps/chosen": -0.2772080898284912, "logps/rejected": -5.496967792510986, "loss": 0.4515, "rewards/accuracies": 1.0, "rewards/chosen": 0.9420620799064636, "rewards/margins": 0.5610126256942749, "rewards/rejected": 0.3810494840145111, "step": 4679 }, { "epoch": 2.52, "learning_rate": 3.164779425721774e-08, "logits/chosen": -2.0240683555603027, "logits/rejected": -2.190688133239746, "logps/chosen": -0.21657635271549225, "logps/rejected": -0.22569985687732697, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.8345630764961243, "rewards/margins": 0.018168091773986816, "rewards/rejected": 0.8163949847221375, "step": 4680 }, { "epoch": 2.52, "learning_rate": 3.1627482365877745e-08, "logits/chosen": -2.1077253818511963, "logits/rejected": -2.1095666885375977, "logps/chosen": -4.157678604125977, "logps/rejected": -2.661421060562134, "loss": 0.2323, "rewards/accuracies": 1.0, "rewards/chosen": 1.9146589040756226, "rewards/margins": 1.3413314819335938, "rewards/rejected": 0.573327362537384, "step": 4681 }, { "epoch": 2.53, "learning_rate": 3.16071739792171e-08, "logits/chosen": -2.1901957988739014, "logits/rejected": -2.188340425491333, "logps/chosen": -3.2516403198242188, "logps/rejected": -5.558182716369629, "loss": 0.2166, "rewards/accuracies": 1.0, "rewards/chosen": 1.7585538625717163, "rewards/margins": 1.4194340705871582, "rewards/rejected": 0.33911973237991333, "step": 4682 }, { "epoch": 2.53, "learning_rate": 3.158686910110979e-08, "logits/chosen": -2.0899548530578613, "logits/rejected": -2.2932825088500977, "logps/chosen": -2.3462555408477783, "logps/rejected": -2.2565999031066895, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.8844661116600037, "rewards/margins": -0.004984557628631592, "rewards/rejected": 0.8894506692886353, "step": 4683 }, { "epoch": 2.53, "learning_rate": 3.1566567735429056e-08, "logits/chosen": -2.141632556915283, "logits/rejected": -2.1387887001037598, "logps/chosen": -6.401864528656006, "logps/rejected": -3.9871866703033447, "loss": 0.2983, "rewards/accuracies": 1.0, "rewards/chosen": 1.632250428199768, "rewards/margins": 1.0567231178283691, "rewards/rejected": 0.5755273103713989, "step": 4684 }, { "epoch": 2.53, "learning_rate": 3.154626988604758e-08, "logits/chosen": -2.1262564659118652, "logits/rejected": -2.272423028945923, "logps/chosen": -1.4627915620803833, "logps/rejected": -4.977847099304199, "loss": 0.6144, "rewards/accuracies": 1.0, "rewards/chosen": 0.9171172976493835, "rewards/margins": 0.16416233777999878, "rewards/rejected": 0.7529549598693848, "step": 4685 }, { "epoch": 2.53, "learning_rate": 3.152597555683728e-08, "logits/chosen": -2.1470465660095215, "logits/rejected": -1.9497820138931274, "logps/chosen": -34.830814361572266, "logps/rejected": -3.9305617809295654, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 2.4505977630615234, "rewards/margins": 1.930521011352539, "rewards/rejected": 0.5200766921043396, "step": 4686 }, { "epoch": 2.53, "learning_rate": 3.150568475166943e-08, "logits/chosen": -2.0460994243621826, "logits/rejected": -2.2896976470947266, "logps/chosen": -0.38820546865463257, "logps/rejected": -0.3978830873966217, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.8729276061058044, "rewards/margins": 0.00243455171585083, "rewards/rejected": 0.8704930543899536, "step": 4687 }, { "epoch": 2.53, "learning_rate": 3.1485397474414635e-08, "logits/chosen": -2.186321496963501, "logits/rejected": -2.182314395904541, "logps/chosen": -3.4625802040100098, "logps/rejected": -12.249813079833984, "loss": 0.3328, "rewards/accuracies": 1.0, "rewards/chosen": 1.2517813444137573, "rewards/margins": 0.929071307182312, "rewards/rejected": 0.3227100372314453, "step": 4688 }, { "epoch": 2.53, "learning_rate": 3.146511372894283e-08, "logits/chosen": -1.9361172914505005, "logits/rejected": -2.2968499660491943, "logps/chosen": -0.11615078896284103, "logps/rejected": -0.12147749215364456, "loss": 0.6716, "rewards/accuracies": 1.0, "rewards/chosen": 0.8917223215103149, "rewards/margins": 0.04353970289230347, "rewards/rejected": 0.8481826186180115, "step": 4689 }, { "epoch": 2.53, "learning_rate": 3.1444833519123284e-08, "logits/chosen": -2.073080062866211, "logits/rejected": -2.070847988128662, "logps/chosen": -1.2167611122131348, "logps/rejected": -6.058435440063477, "loss": 0.405, "rewards/accuracies": 1.0, "rewards/chosen": 1.1088155508041382, "rewards/margins": 0.6945543885231018, "rewards/rejected": 0.4142611622810364, "step": 4690 }, { "epoch": 2.53, "learning_rate": 3.142455684882457e-08, "logits/chosen": -2.133798360824585, "logits/rejected": -2.1104705333709717, "logps/chosen": -11.234331130981445, "logps/rejected": -5.199455261230469, "loss": 0.4795, "rewards/accuracies": 1.0, "rewards/chosen": 1.1931265592575073, "rewards/margins": 0.4856211543083191, "rewards/rejected": 0.7075054049491882, "step": 4691 }, { "epoch": 2.53, "learning_rate": 3.14042837219146e-08, "logits/chosen": -2.0554423332214355, "logits/rejected": -2.280540943145752, "logps/chosen": -0.24366432428359985, "logps/rejected": -0.2348492294549942, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.8121404051780701, "rewards/margins": 0.022386252880096436, "rewards/rejected": 0.7897541522979736, "step": 4692 }, { "epoch": 2.53, "learning_rate": 3.138401414226061e-08, "logits/chosen": -2.0966622829437256, "logits/rejected": -2.2988667488098145, "logps/chosen": -1.180406928062439, "logps/rejected": -1.1671080589294434, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.7196118235588074, "rewards/margins": 0.013449668884277344, "rewards/rejected": 0.70616215467453, "step": 4693 }, { "epoch": 2.53, "learning_rate": 3.1363748113729156e-08, "logits/chosen": -2.077202320098877, "logits/rejected": -2.3357787132263184, "logps/chosen": -0.1053643673658371, "logps/rejected": -0.12067835032939911, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.8606893420219421, "rewards/margins": 0.015739023685455322, "rewards/rejected": 0.8449503183364868, "step": 4694 }, { "epoch": 2.53, "learning_rate": 3.13434856401861e-08, "logits/chosen": -2.138319730758667, "logits/rejected": -2.250641107559204, "logps/chosen": -5.047634601593018, "logps/rejected": -0.6175521612167358, "loss": 0.5551, "rewards/accuracies": 1.0, "rewards/chosen": 1.163067102432251, "rewards/margins": 0.2981337904930115, "rewards/rejected": 0.8649333119392395, "step": 4695 }, { "epoch": 2.53, "learning_rate": 3.132322672549668e-08, "logits/chosen": -2.1396114826202393, "logits/rejected": -2.335864543914795, "logps/chosen": -1.2725409269332886, "logps/rejected": -1.295606017112732, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 1.0307915210723877, "rewards/margins": 0.023373723030090332, "rewards/rejected": 1.0074177980422974, "step": 4696 }, { "epoch": 2.53, "learning_rate": 3.130297137352538e-08, "logits/chosen": -2.071394205093384, "logits/rejected": -2.158778667449951, "logps/chosen": -2.6876583099365234, "logps/rejected": -18.922508239746094, "loss": 0.4045, "rewards/accuracies": 1.0, "rewards/chosen": 1.3539618253707886, "rewards/margins": 0.6959894895553589, "rewards/rejected": 0.6579723358154297, "step": 4697 }, { "epoch": 2.53, "learning_rate": 3.128271958813607e-08, "logits/chosen": -1.9594528675079346, "logits/rejected": -2.274553060531616, "logps/chosen": -0.22824062407016754, "logps/rejected": -0.2784860134124756, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9199005961418152, "rewards/margins": 0.012360095977783203, "rewards/rejected": 0.907540500164032, "step": 4698 }, { "epoch": 2.53, "learning_rate": 3.1262471373191904e-08, "logits/chosen": -1.9531253576278687, "logits/rejected": -2.3189213275909424, "logps/chosen": -2.422548294067383, "logps/rejected": -0.7680299282073975, "loss": 0.6623, "rewards/accuracies": 1.0, "rewards/chosen": 0.9439547657966614, "rewards/margins": 0.06273823976516724, "rewards/rejected": 0.8812165260314941, "step": 4699 }, { "epoch": 2.54, "learning_rate": 3.124222673255535e-08, "logits/chosen": -2.0988128185272217, "logits/rejected": -2.3445005416870117, "logps/chosen": -0.6267744302749634, "logps/rejected": -6.632705211639404, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": 1.0783876180648804, "rewards/margins": 0.19902092218399048, "rewards/rejected": 0.8793666958808899, "step": 4700 }, { "epoch": 2.54, "learning_rate": 3.122198567008822e-08, "logits/chosen": -2.0839641094207764, "logits/rejected": -2.274152994155884, "logps/chosen": -9.163808822631836, "logps/rejected": -4.988466739654541, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 1.1211519241333008, "rewards/margins": 0.028592824935913086, "rewards/rejected": 1.0925590991973877, "step": 4701 }, { "epoch": 2.54, "learning_rate": 3.120174818965164e-08, "logits/chosen": -2.1086080074310303, "logits/rejected": -2.1184699535369873, "logps/chosen": -2.751605272293091, "logps/rejected": -3.0703723430633545, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/chosen": 1.0091311931610107, "rewards/margins": 0.5210957527160645, "rewards/rejected": 0.4880354106426239, "step": 4702 }, { "epoch": 2.54, "learning_rate": 3.118151429510602e-08, "logits/chosen": -2.109635353088379, "logits/rejected": -2.3497326374053955, "logps/chosen": -2.099571466445923, "logps/rejected": -2.0309667587280273, "loss": 0.6725, "rewards/accuracies": 1.0, "rewards/chosen": 1.008696436882019, "rewards/margins": 0.041714370250701904, "rewards/rejected": 0.9669820666313171, "step": 4703 }, { "epoch": 2.54, "learning_rate": 3.116128399031112e-08, "logits/chosen": -2.0442285537719727, "logits/rejected": -2.234499931335449, "logps/chosen": -0.560950756072998, "logps/rejected": -0.5212286710739136, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9069709777832031, "rewards/margins": 0.022497951984405518, "rewards/rejected": 0.8844730257987976, "step": 4704 }, { "epoch": 2.54, "learning_rate": 3.1141057279126005e-08, "logits/chosen": -2.0523691177368164, "logits/rejected": -2.109921455383301, "logps/chosen": -1.1771368980407715, "logps/rejected": -8.458596229553223, "loss": 0.418, "rewards/accuracies": 1.0, "rewards/chosen": 1.5394508838653564, "rewards/margins": 0.6560285091400146, "rewards/rejected": 0.8834223747253418, "step": 4705 }, { "epoch": 2.54, "learning_rate": 3.112083416540906e-08, "logits/chosen": -2.2520627975463867, "logits/rejected": -2.160412073135376, "logps/chosen": -19.889009475708008, "logps/rejected": -4.220243453979492, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 2.0664992332458496, "rewards/margins": 1.4371178150177002, "rewards/rejected": 0.6293813586235046, "step": 4706 }, { "epoch": 2.54, "learning_rate": 3.1100614653017944e-08, "logits/chosen": -2.1324689388275146, "logits/rejected": -2.1294326782226562, "logps/chosen": -3.803227424621582, "logps/rejected": -1.4097044467926025, "loss": 0.3769, "rewards/accuracies": 1.0, "rewards/chosen": 1.6321789026260376, "rewards/margins": 0.7813928723335266, "rewards/rejected": 0.850786030292511, "step": 4707 }, { "epoch": 2.54, "learning_rate": 3.108039874580973e-08, "logits/chosen": -2.1591408252716064, "logits/rejected": -2.1697680950164795, "logps/chosen": -2.7238335609436035, "logps/rejected": -4.898429870605469, "loss": 0.4515, "rewards/accuracies": 1.0, "rewards/chosen": 0.9927726984024048, "rewards/margins": 0.5610525608062744, "rewards/rejected": 0.43172016739845276, "step": 4708 }, { "epoch": 2.54, "learning_rate": 3.106018644764068e-08, "logits/chosen": -2.1182613372802734, "logits/rejected": -2.094088554382324, "logps/chosen": -12.572310447692871, "logps/rejected": -2.0988690853118896, "loss": 0.3841, "rewards/accuracies": 1.0, "rewards/chosen": 1.4587161540985107, "rewards/margins": 0.7586721777915955, "rewards/rejected": 0.7000439763069153, "step": 4709 }, { "epoch": 2.54, "learning_rate": 3.103997776236644e-08, "logits/chosen": -2.0868513584136963, "logits/rejected": -2.0819716453552246, "logps/chosen": -0.10401399433612823, "logps/rejected": -12.705702781677246, "loss": 0.4643, "rewards/accuracies": 1.0, "rewards/chosen": 0.8809059262275696, "rewards/margins": 0.5260974168777466, "rewards/rejected": 0.3548085391521454, "step": 4710 }, { "epoch": 2.54, "learning_rate": 3.101977269384195e-08, "logits/chosen": -2.0205132961273193, "logits/rejected": -2.224790334701538, "logps/chosen": -1.3372749090194702, "logps/rejected": -1.3375047445297241, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 1.0028600692749023, "rewards/margins": 0.007322967052459717, "rewards/rejected": 0.9955371022224426, "step": 4711 }, { "epoch": 2.54, "learning_rate": 3.099957124592147e-08, "logits/chosen": -2.0298380851745605, "logits/rejected": -2.0301096439361572, "logps/chosen": -0.8824242353439331, "logps/rejected": -7.3245158195495605, "loss": 0.4318, "rewards/accuracies": 1.0, "rewards/chosen": 0.9577957987785339, "rewards/margins": 0.6161074042320251, "rewards/rejected": 0.3416883945465088, "step": 4712 }, { "epoch": 2.54, "learning_rate": 3.097937342245854e-08, "logits/chosen": -1.9736473560333252, "logits/rejected": -1.9804978370666504, "logps/chosen": -3.2025699615478516, "logps/rejected": -5.5171051025390625, "loss": 0.4163, "rewards/accuracies": 1.0, "rewards/chosen": 1.1725828647613525, "rewards/margins": 0.6611044406890869, "rewards/rejected": 0.5114784240722656, "step": 4713 }, { "epoch": 2.54, "learning_rate": 3.0959179227306056e-08, "logits/chosen": -2.0740020275115967, "logits/rejected": -2.3380284309387207, "logps/chosen": -0.8238096237182617, "logps/rejected": -0.9980888366699219, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.7490502595901489, "rewards/margins": 0.004313290119171143, "rewards/rejected": 0.7447369694709778, "step": 4714 }, { "epoch": 2.54, "learning_rate": 3.093898866431617e-08, "logits/chosen": -2.065011501312256, "logits/rejected": -2.2715015411376953, "logps/chosen": -0.3441579043865204, "logps/rejected": -0.33252769708633423, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.8260638117790222, "rewards/margins": -0.0036993026733398438, "rewards/rejected": 0.8297631144523621, "step": 4715 }, { "epoch": 2.54, "learning_rate": 3.0918801737340374e-08, "logits/chosen": -2.1471335887908936, "logits/rejected": -2.271002769470215, "logps/chosen": -6.5109663009643555, "logps/rejected": -6.901718616485596, "loss": 0.657, "rewards/accuracies": 1.0, "rewards/chosen": 1.2469123601913452, "rewards/margins": 0.0735769271850586, "rewards/rejected": 1.1733354330062866, "step": 4716 }, { "epoch": 2.54, "learning_rate": 3.089861845022945e-08, "logits/chosen": -2.133878231048584, "logits/rejected": -2.3001179695129395, "logps/chosen": -3.391301155090332, "logps/rejected": -3.3278112411499023, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.8042703866958618, "rewards/margins": -0.004459679126739502, "rewards/rejected": 0.8087300658226013, "step": 4717 }, { "epoch": 2.54, "learning_rate": 3.0878438806833506e-08, "logits/chosen": -2.1387228965759277, "logits/rejected": -2.2479305267333984, "logps/chosen": -0.7496032118797302, "logps/rejected": -2.1137173175811768, "loss": 0.5858, "rewards/accuracies": 1.0, "rewards/chosen": 0.9951842427253723, "rewards/margins": 0.22753190994262695, "rewards/rejected": 0.7676523327827454, "step": 4718 }, { "epoch": 2.55, "learning_rate": 3.085826281100194e-08, "logits/chosen": -2.117544412612915, "logits/rejected": -2.3643646240234375, "logps/chosen": -1.6034570932388306, "logps/rejected": -1.749697208404541, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.7361106276512146, "rewards/margins": 0.017772376537322998, "rewards/rejected": 0.7183382511138916, "step": 4719 }, { "epoch": 2.55, "learning_rate": 3.083809046658346e-08, "logits/chosen": -2.221888780593872, "logits/rejected": -2.285224676132202, "logps/chosen": -0.9148879051208496, "logps/rejected": -0.9996294379234314, "loss": 0.6763, "rewards/accuracies": 1.0, "rewards/chosen": 0.9328014254570007, "rewards/margins": 0.03392016887664795, "rewards/rejected": 0.8988812565803528, "step": 4720 }, { "epoch": 2.55, "learning_rate": 3.081792177742606e-08, "logits/chosen": -2.093935251235962, "logits/rejected": -2.0935065746307373, "logps/chosen": -1.0454626083374023, "logps/rejected": -1.7932937145233154, "loss": 0.6387, "rewards/accuracies": 1.0, "rewards/chosen": 0.865519642829895, "rewards/margins": 0.11199730634689331, "rewards/rejected": 0.7535223364830017, "step": 4721 }, { "epoch": 2.55, "learning_rate": 3.079775674737707e-08, "logits/chosen": -2.023972272872925, "logits/rejected": -2.2449116706848145, "logps/chosen": -0.4939013123512268, "logps/rejected": -0.459354430437088, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 1.0024566650390625, "rewards/margins": 0.045818865299224854, "rewards/rejected": 0.9566377997398376, "step": 4722 }, { "epoch": 2.55, "learning_rate": 3.077759538028309e-08, "logits/chosen": -1.9682321548461914, "logits/rejected": -2.3270275592803955, "logps/chosen": -0.6223335862159729, "logps/rejected": -10.606398582458496, "loss": 0.8126, "rewards/accuracies": 0.0, "rewards/chosen": 0.8902115225791931, "rewards/margins": -0.22619277238845825, "rewards/rejected": 1.1164042949676514, "step": 4723 }, { "epoch": 2.55, "learning_rate": 3.075743767999004e-08, "logits/chosen": -2.0363879203796387, "logits/rejected": -2.2681519985198975, "logps/chosen": -4.865253448486328, "logps/rejected": -4.730684280395508, "loss": 0.7046, "rewards/accuracies": 0.0, "rewards/chosen": 0.9974336624145508, "rewards/margins": -0.022791743278503418, "rewards/rejected": 1.0202254056930542, "step": 4724 }, { "epoch": 2.55, "learning_rate": 3.073728365034313e-08, "logits/chosen": -2.029567003250122, "logits/rejected": -2.027618646621704, "logps/chosen": -0.45147234201431274, "logps/rejected": -3.1891000270843506, "loss": 0.502, "rewards/accuracies": 1.0, "rewards/chosen": 1.048287272453308, "rewards/margins": 0.42755836248397827, "rewards/rejected": 0.6207289099693298, "step": 4725 }, { "epoch": 2.55, "learning_rate": 3.071713329518688e-08, "logits/chosen": -2.130840539932251, "logits/rejected": -2.396355628967285, "logps/chosen": -10.08957290649414, "logps/rejected": -8.60531997680664, "loss": 0.7213, "rewards/accuracies": 0.0, "rewards/chosen": 0.7488851547241211, "rewards/margins": -0.055444180965423584, "rewards/rejected": 0.8043293356895447, "step": 4726 }, { "epoch": 2.55, "learning_rate": 3.0696986618365096e-08, "logits/chosen": -2.0354373455047607, "logits/rejected": -2.2514476776123047, "logps/chosen": -0.4448440372943878, "logps/rejected": -0.5155801773071289, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 0.7931506037712097, "rewards/margins": 0.03480786085128784, "rewards/rejected": 0.7583427429199219, "step": 4727 }, { "epoch": 2.55, "learning_rate": 3.0676843623720893e-08, "logits/chosen": -2.1032347679138184, "logits/rejected": -2.104511022567749, "logps/chosen": -1.7584550380706787, "logps/rejected": -5.964181423187256, "loss": 0.286, "rewards/accuracies": 1.0, "rewards/chosen": 1.5961908102035522, "rewards/margins": 1.1053162813186646, "rewards/rejected": 0.4908745288848877, "step": 4728 }, { "epoch": 2.55, "learning_rate": 3.065670431509667e-08, "logits/chosen": -2.0635955333709717, "logits/rejected": -2.2902023792266846, "logps/chosen": -0.47433698177337646, "logps/rejected": -0.552821695804596, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 1.0630195140838623, "rewards/margins": 0.01835763454437256, "rewards/rejected": 1.0446618795394897, "step": 4729 }, { "epoch": 2.55, "learning_rate": 3.063656869633416e-08, "logits/chosen": -2.2101895809173584, "logits/rejected": -2.287809371948242, "logps/chosen": -3.7212066650390625, "logps/rejected": -25.305767059326172, "loss": 0.4179, "rewards/accuracies": 1.0, "rewards/chosen": 1.150708794593811, "rewards/margins": 0.6563618183135986, "rewards/rejected": 0.4943470060825348, "step": 4730 }, { "epoch": 2.55, "learning_rate": 3.061643677127435e-08, "logits/chosen": -2.1649467945098877, "logits/rejected": -2.305906057357788, "logps/chosen": -0.9567683935165405, "logps/rejected": -7.682928562164307, "loss": 0.5515, "rewards/accuracies": 1.0, "rewards/chosen": 1.1756646633148193, "rewards/margins": 0.3067935109138489, "rewards/rejected": 0.8688711524009705, "step": 4731 }, { "epoch": 2.55, "learning_rate": 3.059630854375755e-08, "logits/chosen": -2.098853826522827, "logits/rejected": -2.3092963695526123, "logps/chosen": -0.49557650089263916, "logps/rejected": -0.4715122580528259, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 1.0070817470550537, "rewards/margins": 0.005070805549621582, "rewards/rejected": 1.0020109415054321, "step": 4732 }, { "epoch": 2.55, "learning_rate": 3.057618401762333e-08, "logits/chosen": -2.0956931114196777, "logits/rejected": -2.3146634101867676, "logps/chosen": -24.9302978515625, "logps/rejected": -26.226160049438477, "loss": 0.7057, "rewards/accuracies": 0.0, "rewards/chosen": 0.919543445110321, "rewards/margins": -0.025023460388183594, "rewards/rejected": 0.9445669054985046, "step": 4733 }, { "epoch": 2.55, "learning_rate": 3.055606319671059e-08, "logits/chosen": -2.0688862800598145, "logits/rejected": -2.2495481967926025, "logps/chosen": -0.3266226351261139, "logps/rejected": -0.22605417668819427, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.7866491675376892, "rewards/margins": 0.01691722869873047, "rewards/rejected": 0.7697319388389587, "step": 4734 }, { "epoch": 2.55, "learning_rate": 3.05359460848575e-08, "logits/chosen": -2.027355194091797, "logits/rejected": -2.250683069229126, "logps/chosen": -1.9136501550674438, "logps/rejected": -5.994284152984619, "loss": 0.6432, "rewards/accuracies": 1.0, "rewards/chosen": 0.9471467137336731, "rewards/margins": 0.10245907306671143, "rewards/rejected": 0.8446876406669617, "step": 4735 }, { "epoch": 2.55, "learning_rate": 3.051583268590155e-08, "logits/chosen": -2.1865267753601074, "logits/rejected": -2.080108165740967, "logps/chosen": -22.678985595703125, "logps/rejected": -4.560551643371582, "loss": 0.123, "rewards/accuracies": 1.0, "rewards/chosen": 2.5282723903656006, "rewards/margins": 2.03320050239563, "rewards/rejected": 0.4950718879699707, "step": 4736 }, { "epoch": 2.56, "learning_rate": 3.049572300367949e-08, "logits/chosen": -1.9967575073242188, "logits/rejected": -1.9939348697662354, "logps/chosen": -5.1735734939575195, "logps/rejected": -2.0993072986602783, "loss": 0.309, "rewards/accuracies": 1.0, "rewards/chosen": 1.730516791343689, "rewards/margins": 1.0157864093780518, "rewards/rejected": 0.714730441570282, "step": 4737 }, { "epoch": 2.56, "learning_rate": 3.047561704202738e-08, "logits/chosen": -2.0956192016601562, "logits/rejected": -2.2701618671417236, "logps/chosen": -0.3198630213737488, "logps/rejected": -0.2953697443008423, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.9014453291893005, "rewards/margins": 0.01204073429107666, "rewards/rejected": 0.8894045948982239, "step": 4738 }, { "epoch": 2.56, "learning_rate": 3.045551480478057e-08, "logits/chosen": -2.0623185634613037, "logits/rejected": -2.0649356842041016, "logps/chosen": -0.9716633558273315, "logps/rejected": -3.1508708000183105, "loss": 0.4668, "rewards/accuracies": 1.0, "rewards/chosen": 1.0953232049942017, "rewards/margins": 0.519428014755249, "rewards/rejected": 0.5758951902389526, "step": 4739 }, { "epoch": 2.56, "learning_rate": 3.0435416295773676e-08, "logits/chosen": -2.1354660987854004, "logits/rejected": -2.102168083190918, "logps/chosen": -22.771072387695312, "logps/rejected": -15.375529289245605, "loss": 0.2655, "rewards/accuracies": 1.0, "rewards/chosen": 2.2216317653656006, "rewards/margins": 1.1903597116470337, "rewards/rejected": 1.031272053718567, "step": 4740 }, { "epoch": 2.56, "learning_rate": 3.041532151884064e-08, "logits/chosen": -2.133075475692749, "logits/rejected": -2.139698028564453, "logps/chosen": -1.5667526721954346, "logps/rejected": -2.574120283126831, "loss": 0.4096, "rewards/accuracies": 1.0, "rewards/chosen": 1.4864614009857178, "rewards/margins": 0.680898129940033, "rewards/rejected": 0.8055632710456848, "step": 4741 }, { "epoch": 2.56, "learning_rate": 3.039523047781467e-08, "logits/chosen": -2.1564342975616455, "logits/rejected": -2.1509807109832764, "logps/chosen": -2.7367353439331055, "logps/rejected": -6.055850505828857, "loss": 0.306, "rewards/accuracies": 1.0, "rewards/chosen": 1.337692379951477, "rewards/margins": 1.0272234678268433, "rewards/rejected": 0.3104689121246338, "step": 4742 }, { "epoch": 2.56, "learning_rate": 3.037514317652826e-08, "logits/chosen": -2.119995594024658, "logits/rejected": -2.1043550968170166, "logps/chosen": -14.007113456726074, "logps/rejected": -2.0562667846679688, "loss": 0.3424, "rewards/accuracies": 1.0, "rewards/chosen": 1.3827022314071655, "rewards/margins": 0.8955568075180054, "rewards/rejected": 0.48714539408683777, "step": 4743 }, { "epoch": 2.56, "learning_rate": 3.035505961881321e-08, "logits/chosen": -2.036301374435425, "logits/rejected": -2.0436623096466064, "logps/chosen": -0.640368640422821, "logps/rejected": -6.65805196762085, "loss": 0.3802, "rewards/accuracies": 1.0, "rewards/chosen": 0.9921088218688965, "rewards/margins": 0.7708264589309692, "rewards/rejected": 0.22128234803676605, "step": 4744 }, { "epoch": 2.56, "learning_rate": 3.0334979808500575e-08, "logits/chosen": -2.054318428039551, "logits/rejected": -2.29703950881958, "logps/chosen": -0.6586498022079468, "logps/rejected": -0.8174763917922974, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.8640527725219727, "rewards/margins": 0.03463304042816162, "rewards/rejected": 0.829419732093811, "step": 4745 }, { "epoch": 2.56, "learning_rate": 3.0314903749420714e-08, "logits/chosen": -2.1378281116485596, "logits/rejected": -2.130936622619629, "logps/chosen": -11.655864715576172, "logps/rejected": -3.8527369499206543, "loss": 0.3826, "rewards/accuracies": 1.0, "rewards/chosen": 1.2896862030029297, "rewards/margins": 0.7634211182594299, "rewards/rejected": 0.5262650847434998, "step": 4746 }, { "epoch": 2.56, "learning_rate": 3.029483144540328e-08, "logits/chosen": -2.0271074771881104, "logits/rejected": -2.242439031600952, "logps/chosen": -0.5454586744308472, "logps/rejected": -0.5076600313186646, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 1.0082283020019531, "rewards/margins": 0.008525729179382324, "rewards/rejected": 0.9997025728225708, "step": 4747 }, { "epoch": 2.56, "learning_rate": 3.027476290027719e-08, "logits/chosen": -2.143791913986206, "logits/rejected": -2.152118444442749, "logps/chosen": -1.3420277833938599, "logps/rejected": -4.204975128173828, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 1.3643169403076172, "rewards/margins": 0.7860596776008606, "rewards/rejected": 0.5782572627067566, "step": 4748 }, { "epoch": 2.56, "learning_rate": 3.025469811787066e-08, "logits/chosen": -2.0855441093444824, "logits/rejected": -2.2882728576660156, "logps/chosen": -0.31269922852516174, "logps/rejected": -0.3004102110862732, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.8706071972846985, "rewards/margins": 0.009134948253631592, "rewards/rejected": 0.8614722490310669, "step": 4749 }, { "epoch": 2.56, "learning_rate": 3.0234637102011155e-08, "logits/chosen": -2.1155169010162354, "logits/rejected": -2.1234633922576904, "logps/chosen": -15.959830284118652, "logps/rejected": -8.07400894165039, "loss": 0.1591, "rewards/accuracies": 1.0, "rewards/chosen": 2.3066089153289795, "rewards/margins": 1.7574572563171387, "rewards/rejected": 0.549151599407196, "step": 4750 }, { "epoch": 2.56, "learning_rate": 3.0214579856525445e-08, "logits/chosen": -1.9780585765838623, "logits/rejected": -1.9308676719665527, "logps/chosen": -12.199104309082031, "logps/rejected": -11.017536163330078, "loss": 0.2308, "rewards/accuracies": 1.0, "rewards/chosen": 1.493646502494812, "rewards/margins": 1.3486257791519165, "rewards/rejected": 0.14502067863941193, "step": 4751 }, { "epoch": 2.56, "learning_rate": 3.0194526385239625e-08, "logits/chosen": -2.1487929821014404, "logits/rejected": -2.148397922515869, "logps/chosen": -4.825592517852783, "logps/rejected": -2.3831355571746826, "loss": 0.2555, "rewards/accuracies": 1.0, "rewards/chosen": 1.8114467859268188, "rewards/margins": 1.2341090440750122, "rewards/rejected": 0.5773377418518066, "step": 4752 }, { "epoch": 2.56, "learning_rate": 3.0174476691979e-08, "logits/chosen": -2.077105760574341, "logits/rejected": -2.0765016078948975, "logps/chosen": -0.4636152982711792, "logps/rejected": -2.0432863235473633, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": 0.9961695075035095, "rewards/margins": 0.19903022050857544, "rewards/rejected": 0.7971392869949341, "step": 4753 }, { "epoch": 2.56, "learning_rate": 3.015443078056819e-08, "logits/chosen": -2.1693782806396484, "logits/rejected": -2.335736036300659, "logps/chosen": -4.972080230712891, "logps/rejected": -3.6342482566833496, "loss": 0.6127, "rewards/accuracies": 1.0, "rewards/chosen": 0.7369621396064758, "rewards/margins": 0.1679384708404541, "rewards/rejected": 0.5690236687660217, "step": 4754 }, { "epoch": 2.56, "learning_rate": 3.013438865483109e-08, "logits/chosen": -2.092822790145874, "logits/rejected": -2.1041347980499268, "logps/chosen": -1.4265546798706055, "logps/rejected": -2.3942654132843018, "loss": 0.4963, "rewards/accuracies": 1.0, "rewards/chosen": 0.9691312909126282, "rewards/margins": 0.4421428442001343, "rewards/rejected": 0.5269884467124939, "step": 4755 }, { "epoch": 2.57, "learning_rate": 3.011435031859084e-08, "logits/chosen": -2.042802333831787, "logits/rejected": -2.290438175201416, "logps/chosen": -0.38538944721221924, "logps/rejected": -0.42368167638778687, "loss": 0.6627, "rewards/accuracies": 1.0, "rewards/chosen": 0.9982559084892273, "rewards/margins": 0.06178641319274902, "rewards/rejected": 0.9364694952964783, "step": 4756 }, { "epoch": 2.57, "learning_rate": 3.009431577566991e-08, "logits/chosen": -2.0385704040527344, "logits/rejected": -2.036076545715332, "logps/chosen": -0.6471455097198486, "logps/rejected": -4.325921058654785, "loss": 0.4905, "rewards/accuracies": 1.0, "rewards/chosen": 0.9655088782310486, "rewards/margins": 0.456998348236084, "rewards/rejected": 0.5085105299949646, "step": 4757 }, { "epoch": 2.57, "learning_rate": 3.007428502989001e-08, "logits/chosen": -2.096583843231201, "logits/rejected": -2.298994779586792, "logps/chosen": -0.23348307609558105, "logps/rejected": -0.21331506967544556, "loss": 0.6718, "rewards/accuracies": 1.0, "rewards/chosen": 0.886132538318634, "rewards/margins": 0.04324758052825928, "rewards/rejected": 0.8428849577903748, "step": 4758 }, { "epoch": 2.57, "learning_rate": 3.0054258085072146e-08, "logits/chosen": -2.0441324710845947, "logits/rejected": -2.250965118408203, "logps/chosen": -0.22602051496505737, "logps/rejected": -0.2259693592786789, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.9650539755821228, "rewards/margins": -0.003052949905395508, "rewards/rejected": 0.9681069254875183, "step": 4759 }, { "epoch": 2.57, "learning_rate": 3.003423494503658e-08, "logits/chosen": -2.0222480297088623, "logits/rejected": -2.025063991546631, "logps/chosen": -2.387148141860962, "logps/rejected": -3.217705726623535, "loss": 0.4651, "rewards/accuracies": 1.0, "rewards/chosen": 1.4472547769546509, "rewards/margins": 0.5238438844680786, "rewards/rejected": 0.9234108924865723, "step": 4760 }, { "epoch": 2.57, "learning_rate": 3.0014215613602875e-08, "logits/chosen": -2.032942295074463, "logits/rejected": -2.0266220569610596, "logps/chosen": -2.4883456230163574, "logps/rejected": -3.6710097789764404, "loss": 0.6125, "rewards/accuracies": 1.0, "rewards/chosen": 1.1341419219970703, "rewards/margins": 0.16838747262954712, "rewards/rejected": 0.9657544493675232, "step": 4761 }, { "epoch": 2.57, "learning_rate": 2.999420009458984e-08, "logits/chosen": -2.0542428493499756, "logits/rejected": -2.297091484069824, "logps/chosen": -0.17120224237442017, "logps/rejected": -0.18565312027931213, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.9079192280769348, "rewards/margins": 0.020696043968200684, "rewards/rejected": 0.8872231841087341, "step": 4762 }, { "epoch": 2.57, "learning_rate": 2.997418839181556e-08, "logits/chosen": -2.064654588699341, "logits/rejected": -2.05690860748291, "logps/chosen": -3.8097283840179443, "logps/rejected": -3.5298757553100586, "loss": 0.397, "rewards/accuracies": 1.0, "rewards/chosen": 1.829553484916687, "rewards/margins": 0.7188600301742554, "rewards/rejected": 1.1106934547424316, "step": 4763 }, { "epoch": 2.57, "learning_rate": 2.9954180509097416e-08, "logits/chosen": -2.1307425498962402, "logits/rejected": -2.125584602355957, "logps/chosen": -4.7583417892456055, "logps/rejected": -6.947039604187012, "loss": 0.3102, "rewards/accuracies": 1.0, "rewards/chosen": 1.4933125972747803, "rewards/margins": 1.0113403797149658, "rewards/rejected": 0.48197221755981445, "step": 4764 }, { "epoch": 2.57, "learning_rate": 2.9934176450252035e-08, "logits/chosen": -2.167769193649292, "logits/rejected": -2.2224295139312744, "logps/chosen": -5.54927921295166, "logps/rejected": -7.7983479499816895, "loss": 0.5383, "rewards/accuracies": 1.0, "rewards/chosen": 1.272910475730896, "rewards/margins": 0.33826082944869995, "rewards/rejected": 0.934649646282196, "step": 4765 }, { "epoch": 2.57, "learning_rate": 2.9914176219095324e-08, "logits/chosen": -2.1762077808380127, "logits/rejected": -2.1151082515716553, "logps/chosen": -18.349201202392578, "logps/rejected": -4.236378192901611, "loss": 0.1403, "rewards/accuracies": 1.0, "rewards/chosen": 2.5666699409484863, "rewards/margins": 1.893139362335205, "rewards/rejected": 0.6735305190086365, "step": 4766 }, { "epoch": 2.57, "learning_rate": 2.989417981944247e-08, "logits/chosen": -2.198040008544922, "logits/rejected": -2.1095316410064697, "logps/chosen": -32.196144104003906, "logps/rejected": -5.263097763061523, "loss": 0.1689, "rewards/accuracies": 1.0, "rewards/chosen": 2.059539794921875, "rewards/margins": 1.6925792694091797, "rewards/rejected": 0.3669605255126953, "step": 4767 }, { "epoch": 2.57, "learning_rate": 2.98741872551079e-08, "logits/chosen": -1.9930421113967896, "logits/rejected": -1.9898279905319214, "logps/chosen": -9.317768096923828, "logps/rejected": -1.860701084136963, "loss": 0.4569, "rewards/accuracies": 1.0, "rewards/chosen": 1.3633531332015991, "rewards/margins": 0.5462266802787781, "rewards/rejected": 0.817126452922821, "step": 4768 }, { "epoch": 2.57, "learning_rate": 2.985419852990534e-08, "logits/chosen": -2.1198630332946777, "logits/rejected": -2.1294384002685547, "logps/chosen": -4.195800304412842, "logps/rejected": -7.6333441734313965, "loss": 0.3015, "rewards/accuracies": 1.0, "rewards/chosen": 1.5103195905685425, "rewards/margins": 1.0445533990859985, "rewards/rejected": 0.46576619148254395, "step": 4769 }, { "epoch": 2.57, "learning_rate": 2.9834213647647763e-08, "logits/chosen": -2.080077886581421, "logits/rejected": -2.069445848464966, "logps/chosen": -3.9542396068573, "logps/rejected": -3.62100887298584, "loss": 0.4764, "rewards/accuracies": 1.0, "rewards/chosen": 1.1524168252944946, "rewards/margins": 0.49391233921051025, "rewards/rejected": 0.6585044860839844, "step": 4770 }, { "epoch": 2.57, "learning_rate": 2.981423261214744e-08, "logits/chosen": -2.0924439430236816, "logits/rejected": -2.2539172172546387, "logps/chosen": -0.5735936760902405, "logps/rejected": -0.5644332766532898, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.8575679659843445, "rewards/margins": 0.008858263492584229, "rewards/rejected": 0.8487097024917603, "step": 4771 }, { "epoch": 2.57, "learning_rate": 2.9794255427215864e-08, "logits/chosen": -2.0398731231689453, "logits/rejected": -2.235368013381958, "logps/chosen": -0.4754396378993988, "logps/rejected": -0.4665120244026184, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9662925004959106, "rewards/margins": 0.02046889066696167, "rewards/rejected": 0.945823609828949, "step": 4772 }, { "epoch": 2.57, "learning_rate": 2.9774282096663817e-08, "logits/chosen": -1.9849992990493774, "logits/rejected": -2.001610517501831, "logps/chosen": -4.637523651123047, "logps/rejected": -7.334379196166992, "loss": 0.5226, "rewards/accuracies": 1.0, "rewards/chosen": 1.0890520811080933, "rewards/margins": 0.3763043284416199, "rewards/rejected": 0.7127477526664734, "step": 4773 }, { "epoch": 2.57, "learning_rate": 2.9754312624301332e-08, "logits/chosen": -1.9994367361068726, "logits/rejected": -2.2932393550872803, "logps/chosen": -0.6724425554275513, "logps/rejected": -0.7660113573074341, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 1.0197553634643555, "rewards/margins": 0.005092978477478027, "rewards/rejected": 1.0146623849868774, "step": 4774 }, { "epoch": 2.58, "learning_rate": 2.973434701393775e-08, "logits/chosen": -2.087937831878662, "logits/rejected": -2.0811519622802734, "logps/chosen": -6.024500846862793, "logps/rejected": -4.578984260559082, "loss": 0.4133, "rewards/accuracies": 1.0, "rewards/chosen": 1.260581135749817, "rewards/margins": 0.6697999835014343, "rewards/rejected": 0.5907811522483826, "step": 4775 }, { "epoch": 2.58, "learning_rate": 2.9714385269381624e-08, "logits/chosen": -2.167462110519409, "logits/rejected": -2.3296265602111816, "logps/chosen": -4.686637878417969, "logps/rejected": -0.811720609664917, "loss": 0.7257, "rewards/accuracies": 0.0, "rewards/chosen": 0.948624312877655, "rewards/margins": -0.0640607476234436, "rewards/rejected": 1.0126850605010986, "step": 4776 }, { "epoch": 2.58, "learning_rate": 2.9694427394440802e-08, "logits/chosen": -2.142298460006714, "logits/rejected": -2.1490962505340576, "logps/chosen": -2.915339469909668, "logps/rejected": -16.251819610595703, "loss": 0.2901, "rewards/accuracies": 1.0, "rewards/chosen": 1.096859335899353, "rewards/margins": 1.0890922546386719, "rewards/rejected": 0.007767105009406805, "step": 4777 }, { "epoch": 2.58, "learning_rate": 2.9674473392922362e-08, "logits/chosen": -2.0740582942962646, "logits/rejected": -2.118177890777588, "logps/chosen": -2.7498373985290527, "logps/rejected": -9.580751419067383, "loss": 0.3416, "rewards/accuracies": 1.0, "rewards/chosen": 1.5649044513702393, "rewards/margins": 0.898345410823822, "rewards/rejected": 0.6665590405464172, "step": 4778 }, { "epoch": 2.58, "learning_rate": 2.965452326863268e-08, "logits/chosen": -2.142547130584717, "logits/rejected": -2.153964042663574, "logps/chosen": -2.0919265747070312, "logps/rejected": -3.5630276203155518, "loss": 0.4559, "rewards/accuracies": 1.0, "rewards/chosen": 1.4552528858184814, "rewards/margins": 0.5488811135292053, "rewards/rejected": 0.9063717722892761, "step": 4779 }, { "epoch": 2.58, "learning_rate": 2.963457702537736e-08, "logits/chosen": -2.1713180541992188, "logits/rejected": -2.310040235519409, "logps/chosen": -1.9757401943206787, "logps/rejected": -1.9663524627685547, "loss": 0.7034, "rewards/accuracies": 0.0, "rewards/chosen": 0.982166588306427, "rewards/margins": -0.020358264446258545, "rewards/rejected": 1.0025248527526855, "step": 4780 }, { "epoch": 2.58, "learning_rate": 2.9614634666961287e-08, "logits/chosen": -2.0679407119750977, "logits/rejected": -2.066554546356201, "logps/chosen": -0.7275574803352356, "logps/rejected": -2.0020949840545654, "loss": 0.5919, "rewards/accuracies": 1.0, "rewards/chosen": 1.018093466758728, "rewards/margins": 0.21388089656829834, "rewards/rejected": 0.8042125701904297, "step": 4781 }, { "epoch": 2.58, "learning_rate": 2.9594696197188595e-08, "logits/chosen": -2.180159330368042, "logits/rejected": -2.1805202960968018, "logps/chosen": -0.7784157395362854, "logps/rejected": -4.94786262512207, "loss": 0.4104, "rewards/accuracies": 1.0, "rewards/chosen": 1.0799827575683594, "rewards/margins": 0.6783319711685181, "rewards/rejected": 0.4016508162021637, "step": 4782 }, { "epoch": 2.58, "learning_rate": 2.9574761619862676e-08, "logits/chosen": -2.0781478881835938, "logits/rejected": -2.0880579948425293, "logps/chosen": -1.7433596849441528, "logps/rejected": -2.333411931991577, "loss": 0.4767, "rewards/accuracies": 1.0, "rewards/chosen": 1.174484372138977, "rewards/margins": 0.4930926561355591, "rewards/rejected": 0.681391716003418, "step": 4783 }, { "epoch": 2.58, "learning_rate": 2.955483093878619e-08, "logits/chosen": -2.1722190380096436, "logits/rejected": -2.3238139152526855, "logps/chosen": -0.40264564752578735, "logps/rejected": -0.43156880140304565, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 0.9985125660896301, "rewards/margins": -0.014380872249603271, "rewards/rejected": 1.0128934383392334, "step": 4784 }, { "epoch": 2.58, "learning_rate": 2.953490415776103e-08, "logits/chosen": -2.139660596847534, "logits/rejected": -2.3551547527313232, "logps/chosen": -0.2828226089477539, "logps/rejected": -0.2837299704551697, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.8528017997741699, "rewards/margins": 0.020951926708221436, "rewards/rejected": 0.8318498730659485, "step": 4785 }, { "epoch": 2.58, "learning_rate": 2.9514981280588386e-08, "logits/chosen": -2.1200785636901855, "logits/rejected": -2.2829067707061768, "logps/chosen": -0.18601834774017334, "logps/rejected": -0.2035643756389618, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 1.0067652463912964, "rewards/margins": 0.02458876371383667, "rewards/rejected": 0.9821764826774597, "step": 4786 }, { "epoch": 2.58, "learning_rate": 2.9495062311068665e-08, "logits/chosen": -2.158386468887329, "logits/rejected": -2.319833278656006, "logps/chosen": -1.2938697338104248, "logps/rejected": -3.1406021118164062, "loss": 0.7419, "rewards/accuracies": 0.0, "rewards/chosen": 0.8778077960014343, "rewards/margins": -0.09517109394073486, "rewards/rejected": 0.9729788899421692, "step": 4787 }, { "epoch": 2.58, "learning_rate": 2.9475147253001546e-08, "logits/chosen": -2.0460596084594727, "logits/rejected": -2.291585922241211, "logps/chosen": -9.365212440490723, "logps/rejected": -9.841897964477539, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 1.142868161201477, "rewards/margins": 0.03594660758972168, "rewards/rejected": 1.1069215536117554, "step": 4788 }, { "epoch": 2.58, "learning_rate": 2.9455236110185954e-08, "logits/chosen": -1.9975816011428833, "logits/rejected": -2.2360739707946777, "logps/chosen": -0.45629313588142395, "logps/rejected": -0.4694162607192993, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.9087390899658203, "rewards/margins": 0.023083865642547607, "rewards/rejected": 0.8856552243232727, "step": 4789 }, { "epoch": 2.58, "learning_rate": 2.943532888642008e-08, "logits/chosen": -2.058030605316162, "logits/rejected": -2.0902700424194336, "logps/chosen": -3.8969826698303223, "logps/rejected": -3.7697691917419434, "loss": 0.5913, "rewards/accuracies": 1.0, "rewards/chosen": 1.2017983198165894, "rewards/margins": 0.21535193920135498, "rewards/rejected": 0.9864463806152344, "step": 4790 }, { "epoch": 2.58, "learning_rate": 2.9415425585501353e-08, "logits/chosen": -2.071214437484741, "logits/rejected": -2.2408533096313477, "logps/chosen": -1.7215259075164795, "logps/rejected": -1.7450569868087769, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.6190213561058044, "rewards/margins": 0.019565999507904053, "rewards/rejected": 0.5994553565979004, "step": 4791 }, { "epoch": 2.58, "learning_rate": 2.939552621122645e-08, "logits/chosen": -2.0399699211120605, "logits/rejected": -2.039868116378784, "logps/chosen": -1.1140549182891846, "logps/rejected": -2.547483205795288, "loss": 0.5596, "rewards/accuracies": 1.0, "rewards/chosen": 1.1067382097244263, "rewards/margins": 0.2876415252685547, "rewards/rejected": 0.8190966844558716, "step": 4792 }, { "epoch": 2.59, "learning_rate": 2.9375630767391325e-08, "logits/chosen": -1.9757790565490723, "logits/rejected": -2.300431728363037, "logps/chosen": -3.2145838737487793, "logps/rejected": -1.50130295753479, "loss": 0.7934, "rewards/accuracies": 0.0, "rewards/chosen": 0.6117972135543823, "rewards/margins": -0.19137108325958252, "rewards/rejected": 0.8031682968139648, "step": 4793 }, { "epoch": 2.59, "learning_rate": 2.9355739257791157e-08, "logits/chosen": -2.0068631172180176, "logits/rejected": -2.2495083808898926, "logps/chosen": -0.2662656307220459, "logps/rejected": -0.32330501079559326, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 1.0025959014892578, "rewards/margins": 0.023106276988983154, "rewards/rejected": 0.9794896245002747, "step": 4794 }, { "epoch": 2.59, "learning_rate": 2.933585168622038e-08, "logits/chosen": -1.9982028007507324, "logits/rejected": -2.006269931793213, "logps/chosen": -5.850688934326172, "logps/rejected": -1.0763906240463257, "loss": 0.2471, "rewards/accuracies": 1.0, "rewards/chosen": 1.8419822454452515, "rewards/margins": 1.2717738151550293, "rewards/rejected": 0.5702084898948669, "step": 4795 }, { "epoch": 2.59, "learning_rate": 2.931596805647267e-08, "logits/chosen": -2.044391632080078, "logits/rejected": -2.0385079383850098, "logps/chosen": -5.46235466003418, "logps/rejected": -4.120204448699951, "loss": 0.3043, "rewards/accuracies": 1.0, "rewards/chosen": 1.5160791873931885, "rewards/margins": 1.0336902141571045, "rewards/rejected": 0.4823889434337616, "step": 4796 }, { "epoch": 2.59, "learning_rate": 2.9296088372340998e-08, "logits/chosen": -2.0287601947784424, "logits/rejected": -2.0098583698272705, "logps/chosen": -9.75178337097168, "logps/rejected": -11.208356857299805, "loss": 0.4794, "rewards/accuracies": 1.0, "rewards/chosen": 1.2200660705566406, "rewards/margins": 0.4859500527381897, "rewards/rejected": 0.7341160178184509, "step": 4797 }, { "epoch": 2.59, "learning_rate": 2.9276212637617516e-08, "logits/chosen": -2.133906364440918, "logits/rejected": -2.3837244510650635, "logps/chosen": -0.5952667593955994, "logps/rejected": -9.940162658691406, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 1.0583564043045044, "rewards/margins": 0.034480929374694824, "rewards/rejected": 1.0238754749298096, "step": 4798 }, { "epoch": 2.59, "learning_rate": 2.9256340856093652e-08, "logits/chosen": -2.1492607593536377, "logits/rejected": -2.064136505126953, "logps/chosen": -5.446069717407227, "logps/rejected": -2.1017351150512695, "loss": 0.3404, "rewards/accuracies": 1.0, "rewards/chosen": 1.7597709894180298, "rewards/margins": 0.9024556279182434, "rewards/rejected": 0.8573153614997864, "step": 4799 }, { "epoch": 2.59, "learning_rate": 2.9236473031560093e-08, "logits/chosen": -2.178940773010254, "logits/rejected": -2.145364999771118, "logps/chosen": -20.44237518310547, "logps/rejected": -5.135746479034424, "loss": 0.2239, "rewards/accuracies": 1.0, "rewards/chosen": 1.8393245935440063, "rewards/margins": 1.3826693296432495, "rewards/rejected": 0.45665526390075684, "step": 4800 }, { "epoch": 2.59, "learning_rate": 2.9216609167806757e-08, "logits/chosen": -1.9878501892089844, "logits/rejected": -2.2431440353393555, "logps/chosen": -0.1648932844400406, "logps/rejected": -0.18627426028251648, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.9445281028747559, "rewards/margins": 0.03212535381317139, "rewards/rejected": 0.9124027490615845, "step": 4801 }, { "epoch": 2.59, "learning_rate": 2.91967492686228e-08, "logits/chosen": -2.1707913875579834, "logits/rejected": -2.2044990062713623, "logps/chosen": -2.8701345920562744, "logps/rejected": -11.210182189941406, "loss": 0.4875, "rewards/accuracies": 1.0, "rewards/chosen": 1.2740408182144165, "rewards/margins": 0.46491187810897827, "rewards/rejected": 0.8091289401054382, "step": 4802 }, { "epoch": 2.59, "learning_rate": 2.9176893337796616e-08, "logits/chosen": -2.0708773136138916, "logits/rejected": -2.3608930110931396, "logps/chosen": -0.23905836045742035, "logps/rejected": -0.2732274830341339, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9071154594421387, "rewards/margins": 0.016483187675476074, "rewards/rejected": 0.8906322717666626, "step": 4803 }, { "epoch": 2.59, "learning_rate": 2.915704137911588e-08, "logits/chosen": -2.1364998817443848, "logits/rejected": -2.3509092330932617, "logps/chosen": -0.2360658198595047, "logps/rejected": -0.2754072844982147, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.8655053377151489, "rewards/margins": 0.025139033794403076, "rewards/rejected": 0.8403663039207458, "step": 4804 }, { "epoch": 2.59, "learning_rate": 2.9137193396367476e-08, "logits/chosen": -2.1188948154449463, "logits/rejected": -2.129183530807495, "logps/chosen": -1.5457828044891357, "logps/rejected": -4.3818230628967285, "loss": 0.4156, "rewards/accuracies": 1.0, "rewards/chosen": 1.3560807704925537, "rewards/margins": 0.6631420850753784, "rewards/rejected": 0.6929386854171753, "step": 4805 }, { "epoch": 2.59, "learning_rate": 2.9117349393337537e-08, "logits/chosen": -2.143427848815918, "logits/rejected": -2.300495147705078, "logps/chosen": -0.5903180837631226, "logps/rejected": -0.5714014172554016, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 1.0316699743270874, "rewards/margins": 0.019518375396728516, "rewards/rejected": 1.0121515989303589, "step": 4806 }, { "epoch": 2.59, "learning_rate": 2.9097509373811435e-08, "logits/chosen": -2.108196973800659, "logits/rejected": -2.109076499938965, "logps/chosen": -0.5499268770217896, "logps/rejected": -5.115969181060791, "loss": 0.4151, "rewards/accuracies": 1.0, "rewards/chosen": 1.0927207469940186, "rewards/margins": 0.6645679473876953, "rewards/rejected": 0.42815279960632324, "step": 4807 }, { "epoch": 2.59, "learning_rate": 2.9077673341573787e-08, "logits/chosen": -2.1155552864074707, "logits/rejected": -2.1304681301116943, "logps/chosen": -3.134129047393799, "logps/rejected": -4.72371768951416, "loss": 0.2602, "rewards/accuracies": 1.0, "rewards/chosen": 1.9147474765777588, "rewards/margins": 1.2132163047790527, "rewards/rejected": 0.7015311121940613, "step": 4808 }, { "epoch": 2.59, "learning_rate": 2.9057841300408454e-08, "logits/chosen": -2.1150755882263184, "logits/rejected": -2.254470109939575, "logps/chosen": -10.75905990600586, "logps/rejected": -0.4944472312927246, "loss": 0.8018, "rewards/accuracies": 0.0, "rewards/chosen": 0.7098398208618164, "rewards/margins": -0.20671314001083374, "rewards/rejected": 0.9165529608726501, "step": 4809 }, { "epoch": 2.59, "learning_rate": 2.903801325409852e-08, "logits/chosen": -2.118119239807129, "logits/rejected": -2.3593735694885254, "logps/chosen": -0.37469881772994995, "logps/rejected": -0.3818832039833069, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9491879343986511, "rewards/margins": 0.0054225921630859375, "rewards/rejected": 0.9437653422355652, "step": 4810 }, { "epoch": 2.59, "learning_rate": 2.901818920642632e-08, "logits/chosen": -2.2432053089141846, "logits/rejected": -2.330101490020752, "logps/chosen": -0.16380584239959717, "logps/rejected": -0.13107143342494965, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.8121799826622009, "rewards/margins": 0.04041588306427002, "rewards/rejected": 0.7717640995979309, "step": 4811 }, { "epoch": 2.6, "learning_rate": 2.899836916117342e-08, "logits/chosen": -2.148001194000244, "logits/rejected": -2.150348663330078, "logps/chosen": -0.930757462978363, "logps/rejected": -4.146304607391357, "loss": 0.446, "rewards/accuracies": 1.0, "rewards/chosen": 1.0210222005844116, "rewards/margins": 0.576240062713623, "rewards/rejected": 0.4447821080684662, "step": 4812 }, { "epoch": 2.6, "learning_rate": 2.8978553122120632e-08, "logits/chosen": -2.100942850112915, "logits/rejected": -2.100930690765381, "logps/chosen": -1.472151517868042, "logps/rejected": -2.177478790283203, "loss": 0.5857, "rewards/accuracies": 1.0, "rewards/chosen": 1.1046870946884155, "rewards/margins": 0.2279266119003296, "rewards/rejected": 0.8767604827880859, "step": 4813 }, { "epoch": 2.6, "learning_rate": 2.8958741093047984e-08, "logits/chosen": -2.1278321743011475, "logits/rejected": -2.12916898727417, "logps/chosen": -1.1802453994750977, "logps/rejected": -2.2400901317596436, "loss": 0.4987, "rewards/accuracies": 1.0, "rewards/chosen": 1.0945746898651123, "rewards/margins": 0.4360700845718384, "rewards/rejected": 0.6585046052932739, "step": 4814 }, { "epoch": 2.6, "learning_rate": 2.8938933077734762e-08, "logits/chosen": -2.0649213790893555, "logits/rejected": -2.0654280185699463, "logps/chosen": -5.291265487670898, "logps/rejected": -5.294325351715088, "loss": 0.4384, "rewards/accuracies": 1.0, "rewards/chosen": 1.1897814273834229, "rewards/margins": 0.5973402857780457, "rewards/rejected": 0.5924411416053772, "step": 4815 }, { "epoch": 2.6, "learning_rate": 2.891912907995947e-08, "logits/chosen": -2.1047465801239014, "logits/rejected": -2.106570243835449, "logps/chosen": -1.0965993404388428, "logps/rejected": -6.217568397521973, "loss": 0.3946, "rewards/accuracies": 1.0, "rewards/chosen": 1.0399887561798096, "rewards/margins": 0.7259678244590759, "rewards/rejected": 0.31402093172073364, "step": 4816 }, { "epoch": 2.6, "learning_rate": 2.8899329103499858e-08, "logits/chosen": -2.0323166847229004, "logits/rejected": -2.3017897605895996, "logps/chosen": -0.1055154874920845, "logps/rejected": -0.1284477412700653, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8717440962791443, "rewards/margins": 0.019818902015686035, "rewards/rejected": 0.8519251942634583, "step": 4817 }, { "epoch": 2.6, "learning_rate": 2.8879533152132902e-08, "logits/chosen": -2.237661361694336, "logits/rejected": -2.3668620586395264, "logps/chosen": -12.806988716125488, "logps/rejected": -8.72939682006836, "loss": 0.6553, "rewards/accuracies": 1.0, "rewards/chosen": 1.0678423643112183, "rewards/margins": 0.07722419500350952, "rewards/rejected": 0.9906181693077087, "step": 4818 }, { "epoch": 2.6, "learning_rate": 2.8859741229634783e-08, "logits/chosen": -2.0642576217651367, "logits/rejected": -2.323385715484619, "logps/chosen": -0.18449586629867554, "logps/rejected": -0.2445589303970337, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9038130044937134, "rewards/margins": 0.012390434741973877, "rewards/rejected": 0.8914225697517395, "step": 4819 }, { "epoch": 2.6, "learning_rate": 2.883995333978101e-08, "logits/chosen": -2.119385242462158, "logits/rejected": -2.2779381275177, "logps/chosen": -1.3556694984436035, "logps/rejected": -1.4978548288345337, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 1.0338724851608276, "rewards/margins": 0.025938987731933594, "rewards/rejected": 1.007933497428894, "step": 4820 }, { "epoch": 2.6, "learning_rate": 2.882016948634619e-08, "logits/chosen": -2.0126068592071533, "logits/rejected": -2.310762643814087, "logps/chosen": -0.30012959241867065, "logps/rejected": -0.39107394218444824, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 1.0127472877502441, "rewards/margins": 0.024371683597564697, "rewards/rejected": 0.9883756041526794, "step": 4821 }, { "epoch": 2.6, "learning_rate": 2.8800389673104242e-08, "logits/chosen": -2.0436291694641113, "logits/rejected": -2.2770581245422363, "logps/chosen": -1.9173423051834106, "logps/rejected": -2.043532371520996, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.800477147102356, "rewards/margins": 0.004814088344573975, "rewards/rejected": 0.795663058757782, "step": 4822 }, { "epoch": 2.6, "learning_rate": 2.87806139038283e-08, "logits/chosen": -2.017906904220581, "logits/rejected": -2.009291887283325, "logps/chosen": -4.655336380004883, "logps/rejected": -0.935242772102356, "loss": 0.4808, "rewards/accuracies": 1.0, "rewards/chosen": 1.259554147720337, "rewards/margins": 0.4822297692298889, "rewards/rejected": 0.777324378490448, "step": 4823 }, { "epoch": 2.6, "learning_rate": 2.8760842182290716e-08, "logits/chosen": -2.1043777465820312, "logits/rejected": -2.3227579593658447, "logps/chosen": -0.18409079313278198, "logps/rejected": -0.24026691913604736, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.8948189616203308, "rewards/margins": 0.03546428680419922, "rewards/rejected": 0.8593546748161316, "step": 4824 }, { "epoch": 2.6, "learning_rate": 2.8741074512263086e-08, "logits/chosen": -1.9942413568496704, "logits/rejected": -1.9976329803466797, "logps/chosen": -0.16556298732757568, "logps/rejected": -3.0022926330566406, "loss": 0.4998, "rewards/accuracies": 1.0, "rewards/chosen": 0.9160105586051941, "rewards/margins": 0.43334707617759705, "rewards/rejected": 0.48266348242759705, "step": 4825 }, { "epoch": 2.6, "learning_rate": 2.8721310897516226e-08, "logits/chosen": -2.0837268829345703, "logits/rejected": -2.290720224380493, "logps/chosen": -2.4442696571350098, "logps/rejected": -2.367257595062256, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.7338669300079346, "rewards/margins": 0.013057589530944824, "rewards/rejected": 0.7208093404769897, "step": 4826 }, { "epoch": 2.6, "learning_rate": 2.8701551341820163e-08, "logits/chosen": -1.9478124380111694, "logits/rejected": -2.2498228549957275, "logps/chosen": -3.007312059402466, "logps/rejected": -1.8948355913162231, "loss": 0.7552, "rewards/accuracies": 0.0, "rewards/chosen": 0.7453028559684753, "rewards/margins": -0.12044203281402588, "rewards/rejected": 0.8657448887825012, "step": 4827 }, { "epoch": 2.6, "learning_rate": 2.868179584894418e-08, "logits/chosen": -2.0662384033203125, "logits/rejected": -2.3035271167755127, "logps/chosen": -2.6614198684692383, "logps/rejected": -2.7462987899780273, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.7192186713218689, "rewards/margins": 0.023394107818603516, "rewards/rejected": 0.6958245635032654, "step": 4828 }, { "epoch": 2.6, "learning_rate": 2.866204442265675e-08, "logits/chosen": -2.077974557876587, "logits/rejected": -2.0753684043884277, "logps/chosen": -0.21126404404640198, "logps/rejected": -6.013174533843994, "loss": 0.4881, "rewards/accuracies": 1.0, "rewards/chosen": 1.0555206537246704, "rewards/margins": 0.46334928274154663, "rewards/rejected": 0.5921713709831238, "step": 4829 }, { "epoch": 2.61, "learning_rate": 2.8642297066725574e-08, "logits/chosen": -2.161142110824585, "logits/rejected": -2.3550312519073486, "logps/chosen": -0.6603274941444397, "logps/rejected": -5.373374938964844, "loss": 0.6274, "rewards/accuracies": 1.0, "rewards/chosen": 0.9471961259841919, "rewards/margins": 0.1361536979675293, "rewards/rejected": 0.8110424280166626, "step": 4830 }, { "epoch": 2.61, "learning_rate": 2.862255378491764e-08, "logits/chosen": -2.042715072631836, "logits/rejected": -2.0552961826324463, "logps/chosen": -6.932294845581055, "logps/rejected": -3.388542652130127, "loss": 0.3342, "rewards/accuracies": 1.0, "rewards/chosen": 1.6723722219467163, "rewards/margins": 0.92426997423172, "rewards/rejected": 0.7481022477149963, "step": 4831 }, { "epoch": 2.61, "learning_rate": 2.860281458099908e-08, "logits/chosen": -2.049682378768921, "logits/rejected": -2.300720691680908, "logps/chosen": -4.978431701660156, "logps/rejected": -5.912936210632324, "loss": 0.6405, "rewards/accuracies": 1.0, "rewards/chosen": 0.6697413325309753, "rewards/margins": 0.10826021432876587, "rewards/rejected": 0.5614811182022095, "step": 4832 }, { "epoch": 2.61, "learning_rate": 2.858307945873528e-08, "logits/chosen": -2.1315765380859375, "logits/rejected": -2.0093579292297363, "logps/chosen": -13.387849807739258, "logps/rejected": -10.869339942932129, "loss": 0.3052, "rewards/accuracies": 1.0, "rewards/chosen": 1.9040911197662354, "rewards/margins": 1.0301508903503418, "rewards/rejected": 0.8739401698112488, "step": 4833 }, { "epoch": 2.61, "learning_rate": 2.8563348421890853e-08, "logits/chosen": -1.9669400453567505, "logits/rejected": -1.9367923736572266, "logps/chosen": -8.877681732177734, "logps/rejected": -4.015738010406494, "loss": 0.2453, "rewards/accuracies": 1.0, "rewards/chosen": 1.7310088872909546, "rewards/margins": 1.2800384759902954, "rewards/rejected": 0.4509704113006592, "step": 4834 }, { "epoch": 2.61, "learning_rate": 2.854362147422962e-08, "logits/chosen": -1.9690052270889282, "logits/rejected": -2.2384724617004395, "logps/chosen": -1.968900442123413, "logps/rejected": -1.978698492050171, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.77430659532547, "rewards/margins": 0.030806660652160645, "rewards/rejected": 0.7434999346733093, "step": 4835 }, { "epoch": 2.61, "learning_rate": 2.852389861951462e-08, "logits/chosen": -2.0477030277252197, "logits/rejected": -2.286874771118164, "logps/chosen": -0.34593793749809265, "logps/rejected": -0.2750038504600525, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.9619145393371582, "rewards/margins": 0.02707010507583618, "rewards/rejected": 0.934844434261322, "step": 4836 }, { "epoch": 2.61, "learning_rate": 2.8504179861508137e-08, "logits/chosen": -2.1266727447509766, "logits/rejected": -2.246826410293579, "logps/chosen": -3.3631057739257812, "logps/rejected": -4.196793556213379, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.7610565423965454, "rewards/margins": -0.011996269226074219, "rewards/rejected": 0.7730528116226196, "step": 4837 }, { "epoch": 2.61, "learning_rate": 2.848446520397163e-08, "logits/chosen": -2.1614701747894287, "logits/rejected": -2.0540659427642822, "logps/chosen": -20.31787872314453, "logps/rejected": -1.2249655723571777, "loss": 0.2326, "rewards/accuracies": 1.0, "rewards/chosen": 2.2036476135253906, "rewards/margins": 1.3400957584381104, "rewards/rejected": 0.863551914691925, "step": 4838 }, { "epoch": 2.61, "learning_rate": 2.8464754650665813e-08, "logits/chosen": -2.1487109661102295, "logits/rejected": -2.0587360858917236, "logps/chosen": -12.966146469116211, "logps/rejected": -5.960186004638672, "loss": 0.2089, "rewards/accuracies": 1.0, "rewards/chosen": 1.8388937711715698, "rewards/margins": 1.4598662853240967, "rewards/rejected": 0.37902745604515076, "step": 4839 }, { "epoch": 2.61, "learning_rate": 2.8445048205350598e-08, "logits/chosen": -2.1405134201049805, "logits/rejected": -2.3580269813537598, "logps/chosen": -0.4463900327682495, "logps/rejected": -0.4479065537452698, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 1.0212770700454712, "rewards/margins": 0.010227799415588379, "rewards/rejected": 1.0110492706298828, "step": 4840 }, { "epoch": 2.61, "learning_rate": 2.8425345871785102e-08, "logits/chosen": -2.179899215698242, "logits/rejected": -2.174370765686035, "logps/chosen": -3.8531975746154785, "logps/rejected": -5.372679233551025, "loss": 0.5686, "rewards/accuracies": 1.0, "rewards/chosen": 0.8666641116142273, "rewards/margins": 0.2668468952178955, "rewards/rejected": 0.5998172163963318, "step": 4841 }, { "epoch": 2.61, "learning_rate": 2.8405647653727727e-08, "logits/chosen": -2.0074732303619385, "logits/rejected": -2.0065178871154785, "logps/chosen": -1.807143211364746, "logps/rejected": -8.96796989440918, "loss": 0.272, "rewards/accuracies": 1.0, "rewards/chosen": 1.5070964097976685, "rewards/margins": 1.162934422492981, "rewards/rejected": 0.3441619873046875, "step": 4842 }, { "epoch": 2.61, "learning_rate": 2.8385953554936003e-08, "logits/chosen": -1.9418025016784668, "logits/rejected": -1.9769171476364136, "logps/chosen": -0.6350212097167969, "logps/rejected": -13.105152130126953, "loss": 0.5368, "rewards/accuracies": 1.0, "rewards/chosen": 1.0795310735702515, "rewards/margins": 0.3417481780052185, "rewards/rejected": 0.737782895565033, "step": 4843 }, { "epoch": 2.61, "learning_rate": 2.8366263579166717e-08, "logits/chosen": -2.088290214538574, "logits/rejected": -2.08918833732605, "logps/chosen": -0.341174840927124, "logps/rejected": -4.540264129638672, "loss": 0.4509, "rewards/accuracies": 1.0, "rewards/chosen": 0.9941201210021973, "rewards/margins": 0.5624957084655762, "rewards/rejected": 0.4316244125366211, "step": 4844 }, { "epoch": 2.61, "learning_rate": 2.8346577730175865e-08, "logits/chosen": -2.2294492721557617, "logits/rejected": -2.2552614212036133, "logps/chosen": -16.21422004699707, "logps/rejected": -13.129284858703613, "loss": 0.4354, "rewards/accuracies": 1.0, "rewards/chosen": 1.9813756942749023, "rewards/margins": 0.6060179471969604, "rewards/rejected": 1.375357747077942, "step": 4845 }, { "epoch": 2.61, "learning_rate": 2.8326896011718647e-08, "logits/chosen": -2.1682045459747314, "logits/rejected": -2.1715986728668213, "logps/chosen": -1.4348390102386475, "logps/rejected": -5.799422740936279, "loss": 0.4161, "rewards/accuracies": 1.0, "rewards/chosen": 1.0791447162628174, "rewards/margins": 0.6615140438079834, "rewards/rejected": 0.4176306426525116, "step": 4846 }, { "epoch": 2.61, "learning_rate": 2.8307218427549485e-08, "logits/chosen": -2.0542566776275635, "logits/rejected": -2.032167434692383, "logps/chosen": -6.845292568206787, "logps/rejected": -6.381796836853027, "loss": 0.3505, "rewards/accuracies": 1.0, "rewards/chosen": 1.3500499725341797, "rewards/margins": 0.8680466413497925, "rewards/rejected": 0.4820033013820648, "step": 4847 }, { "epoch": 2.61, "learning_rate": 2.8287544981422005e-08, "logits/chosen": -2.089111328125, "logits/rejected": -2.277658700942993, "logps/chosen": -5.82602071762085, "logps/rejected": -0.7022767066955566, "loss": 0.628, "rewards/accuracies": 1.0, "rewards/chosen": 0.99249666929245, "rewards/margins": 0.134840726852417, "rewards/rejected": 0.857655942440033, "step": 4848 }, { "epoch": 2.62, "learning_rate": 2.826787567708905e-08, "logits/chosen": -2.222573757171631, "logits/rejected": -2.1875133514404297, "logps/chosen": -26.179595947265625, "logps/rejected": -1.8337304592132568, "loss": 0.3173, "rewards/accuracies": 1.0, "rewards/chosen": 2.0225300788879395, "rewards/margins": 0.9849734306335449, "rewards/rejected": 1.0375566482543945, "step": 4849 }, { "epoch": 2.62, "learning_rate": 2.8248210518302673e-08, "logits/chosen": -2.1259400844573975, "logits/rejected": -2.1262974739074707, "logps/chosen": -1.5707781314849854, "logps/rejected": -2.3213582038879395, "loss": 0.5581, "rewards/accuracies": 1.0, "rewards/chosen": 1.1662319898605347, "rewards/margins": 0.2912060618400574, "rewards/rejected": 0.8750259280204773, "step": 4850 }, { "epoch": 2.62, "learning_rate": 2.8228549508814127e-08, "logits/chosen": -2.2261712551116943, "logits/rejected": -2.219820499420166, "logps/chosen": -1.464435338973999, "logps/rejected": -10.928552627563477, "loss": 0.34, "rewards/accuracies": 1.0, "rewards/chosen": 1.3848836421966553, "rewards/margins": 0.9041523933410645, "rewards/rejected": 0.48073121905326843, "step": 4851 }, { "epoch": 2.62, "learning_rate": 2.8208892652373884e-08, "logits/chosen": -2.1104750633239746, "logits/rejected": -2.3407723903656006, "logps/chosen": -0.8696566224098206, "logps/rejected": -0.792114794254303, "loss": 0.6248, "rewards/accuracies": 1.0, "rewards/chosen": 1.2599856853485107, "rewards/margins": 0.14175915718078613, "rewards/rejected": 1.1182265281677246, "step": 4852 }, { "epoch": 2.62, "learning_rate": 2.8189239952731613e-08, "logits/chosen": -2.113070011138916, "logits/rejected": -2.115018367767334, "logps/chosen": -0.422669380903244, "logps/rejected": -3.7535831928253174, "loss": 0.5237, "rewards/accuracies": 1.0, "rewards/chosen": 0.7844075560569763, "rewards/margins": 0.3736055791378021, "rewards/rejected": 0.4108019769191742, "step": 4853 }, { "epoch": 2.62, "learning_rate": 2.8169591413636207e-08, "logits/chosen": -2.1037681102752686, "logits/rejected": -2.312349557876587, "logps/chosen": -0.9201549291610718, "logps/rejected": -0.9997087717056274, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.8951166272163391, "rewards/margins": 0.03319895267486572, "rewards/rejected": 0.8619176745414734, "step": 4854 }, { "epoch": 2.62, "learning_rate": 2.8149947038835746e-08, "logits/chosen": -2.1087331771850586, "logits/rejected": -2.269124746322632, "logps/chosen": -0.3106050491333008, "logps/rejected": -0.3292018175125122, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.9633362889289856, "rewards/margins": 0.03458130359649658, "rewards/rejected": 0.928754985332489, "step": 4855 }, { "epoch": 2.62, "learning_rate": 2.8130306832077534e-08, "logits/chosen": -2.1116793155670166, "logits/rejected": -2.121976137161255, "logps/chosen": -1.8918166160583496, "logps/rejected": -1.9410046339035034, "loss": 0.4982, "rewards/accuracies": 1.0, "rewards/chosen": 1.2040131092071533, "rewards/margins": 0.43726831674575806, "rewards/rejected": 0.7667447924613953, "step": 4856 }, { "epoch": 2.62, "learning_rate": 2.8110670797108054e-08, "logits/chosen": -2.080421209335327, "logits/rejected": -2.322786331176758, "logps/chosen": -0.29305461049079895, "logps/rejected": -0.2839914560317993, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.9264278411865234, "rewards/margins": 0.03254365921020508, "rewards/rejected": 0.8938841819763184, "step": 4857 }, { "epoch": 2.62, "learning_rate": 2.809103893767303e-08, "logits/chosen": -2.1222424507141113, "logits/rejected": -2.1290736198425293, "logps/chosen": -4.030035018920898, "logps/rejected": -4.5884318351745605, "loss": 0.4151, "rewards/accuracies": 1.0, "rewards/chosen": 1.1910187005996704, "rewards/margins": 0.6645788550376892, "rewards/rejected": 0.5264398455619812, "step": 4858 }, { "epoch": 2.62, "learning_rate": 2.807141125751735e-08, "logits/chosen": -2.1539180278778076, "logits/rejected": -2.333433151245117, "logps/chosen": -0.3130999207496643, "logps/rejected": -0.2913638949394226, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.8701153993606567, "rewards/margins": 0.017155766487121582, "rewards/rejected": 0.8529596328735352, "step": 4859 }, { "epoch": 2.62, "learning_rate": 2.8051787760385136e-08, "logits/chosen": -2.1266963481903076, "logits/rejected": -2.308098554611206, "logps/chosen": -1.898626685142517, "logps/rejected": -0.9066643714904785, "loss": 0.6589, "rewards/accuracies": 1.0, "rewards/chosen": 0.8956424593925476, "rewards/margins": 0.0697181224822998, "rewards/rejected": 0.8259243369102478, "step": 4860 }, { "epoch": 2.62, "learning_rate": 2.80321684500197e-08, "logits/chosen": -2.2059054374694824, "logits/rejected": -2.3043911457061768, "logps/chosen": -2.3781652450561523, "logps/rejected": -2.34677791595459, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.9620117545127869, "rewards/margins": 0.0067961812019348145, "rewards/rejected": 0.955215573310852, "step": 4861 }, { "epoch": 2.62, "learning_rate": 2.8012553330163547e-08, "logits/chosen": -2.1206352710723877, "logits/rejected": -2.1481773853302, "logps/chosen": -5.17647647857666, "logps/rejected": -8.858585357666016, "loss": 0.3858, "rewards/accuracies": 1.0, "rewards/chosen": 1.6800072193145752, "rewards/margins": 0.7532820105552673, "rewards/rejected": 0.9267252087593079, "step": 4862 }, { "epoch": 2.62, "learning_rate": 2.79929424045584e-08, "logits/chosen": -2.0188543796539307, "logits/rejected": -2.013838291168213, "logps/chosen": -3.2588083744049072, "logps/rejected": -3.7506749629974365, "loss": 0.3394, "rewards/accuracies": 1.0, "rewards/chosen": 1.4866074323654175, "rewards/margins": 0.9061519503593445, "rewards/rejected": 0.580455482006073, "step": 4863 }, { "epoch": 2.62, "learning_rate": 2.7973335676945147e-08, "logits/chosen": -2.1653783321380615, "logits/rejected": -2.161233901977539, "logps/chosen": -3.375349283218384, "logps/rejected": -4.148859977722168, "loss": 0.5261, "rewards/accuracies": 1.0, "rewards/chosen": 0.8209306001663208, "rewards/margins": 0.36780989170074463, "rewards/rejected": 0.45312070846557617, "step": 4864 }, { "epoch": 2.62, "learning_rate": 2.7953733151063946e-08, "logits/chosen": -2.129807710647583, "logits/rejected": -2.1333887577056885, "logps/chosen": -0.47252190113067627, "logps/rejected": -8.945901870727539, "loss": 0.3839, "rewards/accuracies": 1.0, "rewards/chosen": 1.2043884992599487, "rewards/margins": 0.75938880443573, "rewards/rejected": 0.44499969482421875, "step": 4865 }, { "epoch": 2.62, "learning_rate": 2.7934134830654086e-08, "logits/chosen": -1.9606581926345825, "logits/rejected": -2.216977834701538, "logps/chosen": -0.23028364777565002, "logps/rejected": -0.265259325504303, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": 0.9463551640510559, "rewards/margins": -0.01493990421295166, "rewards/rejected": 0.9612950682640076, "step": 4866 }, { "epoch": 2.63, "learning_rate": 2.7914540719454093e-08, "logits/chosen": -2.0651447772979736, "logits/rejected": -2.2516045570373535, "logps/chosen": -0.26443731784820557, "logps/rejected": -0.2520981431007385, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.90962153673172, "rewards/margins": -0.007738590240478516, "rewards/rejected": 0.9173601269721985, "step": 4867 }, { "epoch": 2.63, "learning_rate": 2.7894950821201635e-08, "logits/chosen": -2.1132214069366455, "logits/rejected": -2.106417655944824, "logps/chosen": -7.311728477478027, "logps/rejected": -4.779296875, "loss": 0.256, "rewards/accuracies": 1.0, "rewards/chosen": 1.6719211339950562, "rewards/margins": 1.231904149055481, "rewards/rejected": 0.4400169551372528, "step": 4868 }, { "epoch": 2.63, "learning_rate": 2.7875365139633643e-08, "logits/chosen": -2.1134033203125, "logits/rejected": -2.247274160385132, "logps/chosen": -3.5738072395324707, "logps/rejected": -3.346130847930908, "loss": 0.7225, "rewards/accuracies": 0.0, "rewards/chosen": 0.7253005504608154, "rewards/margins": -0.0578349232673645, "rewards/rejected": 0.7831354737281799, "step": 4869 }, { "epoch": 2.63, "learning_rate": 2.78557836784862e-08, "logits/chosen": -2.0485410690307617, "logits/rejected": -2.3470489978790283, "logps/chosen": -0.7285796403884888, "logps/rejected": -0.7695854902267456, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.9070486426353455, "rewards/margins": 0.02337116003036499, "rewards/rejected": 0.8836774826049805, "step": 4870 }, { "epoch": 2.63, "learning_rate": 2.7836206441494614e-08, "logits/chosen": -2.125194787979126, "logits/rejected": -2.383051872253418, "logps/chosen": -11.518468856811523, "logps/rejected": -7.110739707946777, "loss": 0.8185, "rewards/accuracies": 0.0, "rewards/chosen": 1.0318987369537354, "rewards/margins": -0.23678386211395264, "rewards/rejected": 1.268682599067688, "step": 4871 }, { "epoch": 2.63, "learning_rate": 2.7816633432393365e-08, "logits/chosen": -2.2083544731140137, "logits/rejected": -2.277517557144165, "logps/chosen": -2.2075066566467285, "logps/rejected": -11.990983009338379, "loss": 0.431, "rewards/accuracies": 1.0, "rewards/chosen": 1.4609565734863281, "rewards/margins": 0.6183541417121887, "rewards/rejected": 0.8426024317741394, "step": 4872 }, { "epoch": 2.63, "learning_rate": 2.7797064654916135e-08, "logits/chosen": -2.0594961643218994, "logits/rejected": -2.1801300048828125, "logps/chosen": -0.3593873977661133, "logps/rejected": -16.455612182617188, "loss": 0.5147, "rewards/accuracies": 1.0, "rewards/chosen": 0.9890589714050293, "rewards/margins": 0.39592885971069336, "rewards/rejected": 0.5931301116943359, "step": 4873 }, { "epoch": 2.63, "learning_rate": 2.7777500112795803e-08, "logits/chosen": -2.00337815284729, "logits/rejected": -2.368169069290161, "logps/chosen": -2.9772088527679443, "logps/rejected": -3.1918277740478516, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 1.1127595901489258, "rewards/margins": 0.0060569047927856445, "rewards/rejected": 1.1067026853561401, "step": 4874 }, { "epoch": 2.63, "learning_rate": 2.7757939809764407e-08, "logits/chosen": -1.995666742324829, "logits/rejected": -2.2686691284179688, "logps/chosen": -0.36358386278152466, "logps/rejected": -0.39806440472602844, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.9702612161636353, "rewards/margins": 0.006751656532287598, "rewards/rejected": 0.9635095596313477, "step": 4875 }, { "epoch": 2.63, "learning_rate": 2.7738383749553252e-08, "logits/chosen": -2.0280847549438477, "logits/rejected": -2.0328478813171387, "logps/chosen": -2.7502036094665527, "logps/rejected": -0.655531644821167, "loss": 0.6308, "rewards/accuracies": 1.0, "rewards/chosen": 1.1428431272506714, "rewards/margins": 0.12888765335083008, "rewards/rejected": 1.0139554738998413, "step": 4876 }, { "epoch": 2.63, "learning_rate": 2.771883193589275e-08, "logits/chosen": -2.21276593208313, "logits/rejected": -2.208890199661255, "logps/chosen": -5.827294826507568, "logps/rejected": -6.544423580169678, "loss": 0.3658, "rewards/accuracies": 1.0, "rewards/chosen": 1.2513169050216675, "rewards/margins": 0.8170879483222961, "rewards/rejected": 0.43422895669937134, "step": 4877 }, { "epoch": 2.63, "learning_rate": 2.769928437251256e-08, "logits/chosen": -2.131446599960327, "logits/rejected": -2.1468329429626465, "logps/chosen": -2.886868476867676, "logps/rejected": -5.0963826179504395, "loss": 0.4595, "rewards/accuracies": 1.0, "rewards/chosen": 1.384700894355774, "rewards/margins": 0.538981556892395, "rewards/rejected": 0.8457193374633789, "step": 4878 }, { "epoch": 2.63, "learning_rate": 2.76797410631415e-08, "logits/chosen": -2.0074098110198975, "logits/rejected": -2.0069363117218018, "logps/chosen": -0.3638457953929901, "logps/rejected": -1.9170563220977783, "loss": 0.5629, "rewards/accuracies": 1.0, "rewards/chosen": 0.8916986584663391, "rewards/margins": 0.27998989820480347, "rewards/rejected": 0.6117087602615356, "step": 4879 }, { "epoch": 2.63, "learning_rate": 2.7660202011507584e-08, "logits/chosen": -2.0992157459259033, "logits/rejected": -2.10001802444458, "logps/chosen": -0.605869710445404, "logps/rejected": -5.909177780151367, "loss": 0.4929, "rewards/accuracies": 1.0, "rewards/chosen": 0.8922699093818665, "rewards/margins": 0.4508804380893707, "rewards/rejected": 0.4413894712924957, "step": 4880 }, { "epoch": 2.63, "learning_rate": 2.764066722133801e-08, "logits/chosen": -2.1107966899871826, "logits/rejected": -2.3259084224700928, "logps/chosen": -0.36358416080474854, "logps/rejected": -0.3535209894180298, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": 1.0439038276672363, "rewards/margins": 0.03628218173980713, "rewards/rejected": 1.0076216459274292, "step": 4881 }, { "epoch": 2.63, "learning_rate": 2.7621136696359183e-08, "logits/chosen": -2.0801053047180176, "logits/rejected": -2.1346826553344727, "logps/chosen": -4.810549259185791, "logps/rejected": -12.235076904296875, "loss": 0.3142, "rewards/accuracies": 1.0, "rewards/chosen": 1.5398472547531128, "rewards/margins": 0.9965428113937378, "rewards/rejected": 0.543304443359375, "step": 4882 }, { "epoch": 2.63, "learning_rate": 2.760161044029667e-08, "logits/chosen": -2.0911290645599365, "logits/rejected": -2.3367812633514404, "logps/chosen": -0.5783022046089172, "logps/rejected": -0.5899041295051575, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.9719142913818359, "rewards/margins": 0.028394997119903564, "rewards/rejected": 0.9435192942619324, "step": 4883 }, { "epoch": 2.63, "learning_rate": 2.758208845687525e-08, "logits/chosen": -2.2397398948669434, "logits/rejected": -2.2075648307800293, "logps/chosen": -22.723548889160156, "logps/rejected": -11.220826148986816, "loss": 0.3332, "rewards/accuracies": 1.0, "rewards/chosen": 2.021446943283081, "rewards/margins": 0.9276207685470581, "rewards/rejected": 1.093826174736023, "step": 4884 }, { "epoch": 2.63, "learning_rate": 2.756257074981885e-08, "logits/chosen": -2.0275816917419434, "logits/rejected": -2.022440195083618, "logps/chosen": -5.66033411026001, "logps/rejected": -4.334797382354736, "loss": 0.2772, "rewards/accuracies": 1.0, "rewards/chosen": 1.6870691776275635, "rewards/margins": 1.1412642002105713, "rewards/rejected": 0.5458049178123474, "step": 4885 }, { "epoch": 2.64, "learning_rate": 2.7543057322850577e-08, "logits/chosen": -2.1435866355895996, "logits/rejected": -2.3090529441833496, "logps/chosen": -0.615507960319519, "logps/rejected": -0.6562316417694092, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 1.0258550643920898, "rewards/margins": 0.002450227737426758, "rewards/rejected": 1.023404836654663, "step": 4886 }, { "epoch": 2.64, "learning_rate": 2.7523548179692802e-08, "logits/chosen": -2.2217025756835938, "logits/rejected": -2.2282562255859375, "logps/chosen": -1.7487215995788574, "logps/rejected": -5.099285125732422, "loss": 0.4377, "rewards/accuracies": 1.0, "rewards/chosen": 0.9809364676475525, "rewards/margins": 0.5994182825088501, "rewards/rejected": 0.3815181851387024, "step": 4887 }, { "epoch": 2.64, "learning_rate": 2.7504043324066994e-08, "logits/chosen": -2.0651655197143555, "logits/rejected": -2.074082136154175, "logps/chosen": -0.9052717685699463, "logps/rejected": -2.8234403133392334, "loss": 0.4606, "rewards/accuracies": 1.0, "rewards/chosen": 1.0960988998413086, "rewards/margins": 0.535975992679596, "rewards/rejected": 0.5601229071617126, "step": 4888 }, { "epoch": 2.64, "learning_rate": 2.7484542759693842e-08, "logits/chosen": -2.037841796875, "logits/rejected": -2.0388669967651367, "logps/chosen": -2.789205312728882, "logps/rejected": -1.1632335186004639, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 1.4662039279937744, "rewards/margins": 0.520267128944397, "rewards/rejected": 0.9459367990493774, "step": 4889 }, { "epoch": 2.64, "learning_rate": 2.7465046490293202e-08, "logits/chosen": -2.086320161819458, "logits/rejected": -2.295699119567871, "logps/chosen": -6.885972499847412, "logps/rejected": -0.6370411515235901, "loss": 0.7348, "rewards/accuracies": 0.0, "rewards/chosen": 0.8036083579063416, "rewards/margins": -0.08168745040893555, "rewards/rejected": 0.8852958083152771, "step": 4890 }, { "epoch": 2.64, "learning_rate": 2.7445554519584125e-08, "logits/chosen": -2.1911964416503906, "logits/rejected": -2.331303119659424, "logps/chosen": -0.26609236001968384, "logps/rejected": -0.24186107516288757, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.9103608131408691, "rewards/margins": 0.023070037364959717, "rewards/rejected": 0.8872907757759094, "step": 4891 }, { "epoch": 2.64, "learning_rate": 2.7426066851284813e-08, "logits/chosen": -2.1416983604431152, "logits/rejected": -2.336355209350586, "logps/chosen": -0.8372403383255005, "logps/rejected": -0.859243631362915, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 1.025294542312622, "rewards/margins": 0.017859816551208496, "rewards/rejected": 1.0074347257614136, "step": 4892 }, { "epoch": 2.64, "learning_rate": 2.7406583489112688e-08, "logits/chosen": -2.1284995079040527, "logits/rejected": -2.268958330154419, "logps/chosen": -0.388944149017334, "logps/rejected": -0.4597614109516144, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.836090087890625, "rewards/margins": -0.005064129829406738, "rewards/rejected": 0.8411542177200317, "step": 4893 }, { "epoch": 2.64, "learning_rate": 2.738710443678433e-08, "logits/chosen": -2.0511646270751953, "logits/rejected": -2.039299964904785, "logps/chosen": -3.3142662048339844, "logps/rejected": -2.981476068496704, "loss": 0.3258, "rewards/accuracies": 1.0, "rewards/chosen": 1.7335503101348877, "rewards/margins": 0.9540733695030212, "rewards/rejected": 0.7794769406318665, "step": 4894 }, { "epoch": 2.64, "learning_rate": 2.736762969801548e-08, "logits/chosen": -2.1035616397857666, "logits/rejected": -2.3290886878967285, "logps/chosen": -1.0473840236663818, "logps/rejected": -1.0758311748504639, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 1.0819642543792725, "rewards/margins": 0.012107133865356445, "rewards/rejected": 1.069857120513916, "step": 4895 }, { "epoch": 2.64, "learning_rate": 2.7348159276521098e-08, "logits/chosen": -2.0466701984405518, "logits/rejected": -2.0516438484191895, "logps/chosen": -0.36738717555999756, "logps/rejected": -4.989296913146973, "loss": 0.4609, "rewards/accuracies": 1.0, "rewards/chosen": 0.8466903567314148, "rewards/margins": 0.5354135036468506, "rewards/rejected": 0.3112768232822418, "step": 4896 }, { "epoch": 2.64, "learning_rate": 2.732869317601527e-08, "logits/chosen": -2.1380863189697266, "logits/rejected": -2.1430249214172363, "logps/chosen": -1.860779047012329, "logps/rejected": -5.962587356567383, "loss": 0.3623, "rewards/accuracies": 1.0, "rewards/chosen": 1.1823734045028687, "rewards/margins": 0.8287434577941895, "rewards/rejected": 0.3536299765110016, "step": 4897 }, { "epoch": 2.64, "learning_rate": 2.7309231400211302e-08, "logits/chosen": -1.9916936159133911, "logits/rejected": -1.9914848804473877, "logps/chosen": -1.6344231367111206, "logps/rejected": -3.1216273307800293, "loss": 0.575, "rewards/accuracies": 1.0, "rewards/chosen": 1.1153358221054077, "rewards/margins": 0.2521701455116272, "rewards/rejected": 0.8631656765937805, "step": 4898 }, { "epoch": 2.64, "learning_rate": 2.728977395282165e-08, "logits/chosen": -2.0963714122772217, "logits/rejected": -2.332897424697876, "logps/chosen": -1.1536250114440918, "logps/rejected": -0.8699555397033691, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 0.5822393894195557, "rewards/margins": -0.015510976314544678, "rewards/rejected": 0.5977503657341003, "step": 4899 }, { "epoch": 2.64, "learning_rate": 2.727032083755795e-08, "logits/chosen": -2.1774308681488037, "logits/rejected": -2.1668829917907715, "logps/chosen": -0.7053819894790649, "logps/rejected": -6.252762794494629, "loss": 0.4183, "rewards/accuracies": 1.0, "rewards/chosen": 1.1175200939178467, "rewards/margins": 0.6550799608230591, "rewards/rejected": 0.4624401032924652, "step": 4900 }, { "epoch": 2.64, "learning_rate": 2.7250872058131007e-08, "logits/chosen": -2.0126137733459473, "logits/rejected": -2.2826006412506104, "logps/chosen": -0.3265337347984314, "logps/rejected": -0.3915354013442993, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.9857574701309204, "rewards/margins": 0.028011322021484375, "rewards/rejected": 0.957746148109436, "step": 4901 }, { "epoch": 2.64, "learning_rate": 2.723142761825082e-08, "logits/chosen": -2.154958963394165, "logits/rejected": -2.1561436653137207, "logps/chosen": -1.2724740505218506, "logps/rejected": -12.706503868103027, "loss": 0.32, "rewards/accuracies": 1.0, "rewards/chosen": 1.1833621263504028, "rewards/margins": 0.975014328956604, "rewards/rejected": 0.20834779739379883, "step": 4902 }, { "epoch": 2.64, "learning_rate": 2.7211987521626535e-08, "logits/chosen": -2.0276200771331787, "logits/rejected": -2.420128345489502, "logps/chosen": -7.152068138122559, "logps/rejected": -12.858575820922852, "loss": 0.9575, "rewards/accuracies": 0.0, "rewards/chosen": 0.9328274726867676, "rewards/margins": -0.47331082820892334, "rewards/rejected": 1.406138300895691, "step": 4903 }, { "epoch": 2.65, "learning_rate": 2.7192551771966472e-08, "logits/chosen": -2.088322162628174, "logits/rejected": -2.3234739303588867, "logps/chosen": -0.2471412718296051, "logps/rejected": -0.21419496834278107, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9342390894889832, "rewards/margins": 0.01710277795791626, "rewards/rejected": 0.9171363115310669, "step": 4904 }, { "epoch": 2.65, "learning_rate": 2.7173120372978143e-08, "logits/chosen": -2.1074700355529785, "logits/rejected": -2.29445219039917, "logps/chosen": -0.1736157238483429, "logps/rejected": -0.20062780380249023, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.83399897813797, "rewards/margins": 0.005898952484130859, "rewards/rejected": 0.8281000256538391, "step": 4905 }, { "epoch": 2.65, "learning_rate": 2.7153693328368205e-08, "logits/chosen": -2.032653570175171, "logits/rejected": -2.035555362701416, "logps/chosen": -1.8244001865386963, "logps/rejected": -5.279690742492676, "loss": 0.2357, "rewards/accuracies": 1.0, "rewards/chosen": 1.7284185886383057, "rewards/margins": 1.3250463008880615, "rewards/rejected": 0.40337228775024414, "step": 4906 }, { "epoch": 2.65, "learning_rate": 2.7134270641842505e-08, "logits/chosen": -2.079469919204712, "logits/rejected": -2.299706220626831, "logps/chosen": -5.449191570281982, "logps/rejected": -1.4058427810668945, "loss": 0.7066, "rewards/accuracies": 0.0, "rewards/chosen": 1.046534538269043, "rewards/margins": -0.02670431137084961, "rewards/rejected": 1.0732388496398926, "step": 4907 }, { "epoch": 2.65, "learning_rate": 2.7114852317106018e-08, "logits/chosen": -2.049801826477051, "logits/rejected": -2.046062469482422, "logps/chosen": -5.084499835968018, "logps/rejected": -4.757769584655762, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 1.2057212591171265, "rewards/margins": 0.7616961002349854, "rewards/rejected": 0.4440251290798187, "step": 4908 }, { "epoch": 2.65, "learning_rate": 2.7095438357862972e-08, "logits/chosen": -2.1662771701812744, "logits/rejected": -2.35813570022583, "logps/chosen": -1.0142682790756226, "logps/rejected": -0.9844329357147217, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.7006195783615112, "rewards/margins": 0.0016444921493530273, "rewards/rejected": 0.6989750862121582, "step": 4909 }, { "epoch": 2.65, "learning_rate": 2.7076028767816672e-08, "logits/chosen": -2.1166203022003174, "logits/rejected": -2.1355645656585693, "logps/chosen": -4.0051093101501465, "logps/rejected": -3.5787222385406494, "loss": 0.4523, "rewards/accuracies": 1.0, "rewards/chosen": 1.305883765220642, "rewards/margins": 0.5588120818138123, "rewards/rejected": 0.7470716834068298, "step": 4910 }, { "epoch": 2.65, "learning_rate": 2.705662355066964e-08, "logits/chosen": -2.181295871734619, "logits/rejected": -2.1799519062042236, "logps/chosen": -2.3728601932525635, "logps/rejected": -13.617122650146484, "loss": 0.5095, "rewards/accuracies": 1.0, "rewards/chosen": 0.9924905896186829, "rewards/margins": 0.40867727994918823, "rewards/rejected": 0.5838133096694946, "step": 4911 }, { "epoch": 2.65, "learning_rate": 2.703722271012354e-08, "logits/chosen": -2.0024449825286865, "logits/rejected": -2.0096852779388428, "logps/chosen": -2.6491611003875732, "logps/rejected": -3.3273544311523438, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": 1.2609946727752686, "rewards/margins": 0.6951888203620911, "rewards/rejected": 0.5658058524131775, "step": 4912 }, { "epoch": 2.65, "learning_rate": 2.7017826249879237e-08, "logits/chosen": -2.140535593032837, "logits/rejected": -2.323176383972168, "logps/chosen": -2.3123745918273926, "logps/rejected": -2.327937126159668, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9961332678794861, "rewards/margins": 0.022287607192993164, "rewards/rejected": 0.9738456606864929, "step": 4913 }, { "epoch": 2.65, "learning_rate": 2.6998434173636686e-08, "logits/chosen": -2.16109299659729, "logits/rejected": -2.2170770168304443, "logps/chosen": -10.481335639953613, "logps/rejected": -20.502042770385742, "loss": 0.3282, "rewards/accuracies": 1.0, "rewards/chosen": 1.7880871295928955, "rewards/margins": 0.9456087350845337, "rewards/rejected": 0.8424783945083618, "step": 4914 }, { "epoch": 2.65, "learning_rate": 2.6979046485095085e-08, "logits/chosen": -2.017139196395874, "logits/rejected": -2.286898374557495, "logps/chosen": -1.01582670211792, "logps/rejected": -1.191282033920288, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.8565172553062439, "rewards/margins": 0.013456761837005615, "rewards/rejected": 0.8430604934692383, "step": 4915 }, { "epoch": 2.65, "learning_rate": 2.695966318795276e-08, "logits/chosen": -2.2505295276641846, "logits/rejected": -2.197615623474121, "logps/chosen": -21.036026000976562, "logps/rejected": -3.691640853881836, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 2.384838819503784, "rewards/margins": 1.5549123287200928, "rewards/rejected": 0.8299264311790466, "step": 4916 }, { "epoch": 2.65, "learning_rate": 2.69402842859072e-08, "logits/chosen": -2.2250962257385254, "logits/rejected": -2.219844102859497, "logps/chosen": -5.8560075759887695, "logps/rejected": -4.393594741821289, "loss": 0.446, "rewards/accuracies": 1.0, "rewards/chosen": 0.9858426451683044, "rewards/margins": 0.5762683153152466, "rewards/rejected": 0.40957432985305786, "step": 4917 }, { "epoch": 2.65, "learning_rate": 2.6920909782655056e-08, "logits/chosen": -2.0991883277893066, "logits/rejected": -2.100778102874756, "logps/chosen": -1.62546968460083, "logps/rejected": -1.0762832164764404, "loss": 0.5234, "rewards/accuracies": 1.0, "rewards/chosen": 1.3986263275146484, "rewards/margins": 0.3742481470108032, "rewards/rejected": 1.0243781805038452, "step": 4918 }, { "epoch": 2.65, "learning_rate": 2.6901539681892148e-08, "logits/chosen": -2.0781946182250977, "logits/rejected": -2.068626880645752, "logps/chosen": -5.396323204040527, "logps/rejected": -5.538040637969971, "loss": 0.3744, "rewards/accuracies": 1.0, "rewards/chosen": 1.2262412309646606, "rewards/margins": 0.7895289063453674, "rewards/rejected": 0.4367123246192932, "step": 4919 }, { "epoch": 2.65, "learning_rate": 2.688217398731344e-08, "logits/chosen": -2.1071360111236572, "logits/rejected": -2.1137635707855225, "logps/chosen": -0.9312980771064758, "logps/rejected": -3.312880516052246, "loss": 0.4446, "rewards/accuracies": 1.0, "rewards/chosen": 1.1361163854599, "rewards/margins": 0.580092191696167, "rewards/rejected": 0.5560241937637329, "step": 4920 }, { "epoch": 2.65, "learning_rate": 2.6862812702613085e-08, "logits/chosen": -2.1604464054107666, "logits/rejected": -2.1618432998657227, "logps/chosen": -1.411534070968628, "logps/rejected": -3.8403377532958984, "loss": 0.4933, "rewards/accuracies": 1.0, "rewards/chosen": 1.0200327634811401, "rewards/margins": 0.4498838186264038, "rewards/rejected": 0.5701489448547363, "step": 4921 }, { "epoch": 2.65, "learning_rate": 2.6843455831484374e-08, "logits/chosen": -2.03474497795105, "logits/rejected": -2.0240323543548584, "logps/chosen": -11.994857788085938, "logps/rejected": -2.7885820865631104, "loss": 0.617, "rewards/accuracies": 1.0, "rewards/chosen": 0.9939039349555969, "rewards/margins": 0.15861797332763672, "rewards/rejected": 0.8352859616279602, "step": 4922 }, { "epoch": 2.66, "learning_rate": 2.6824103377619744e-08, "logits/chosen": -2.1259968280792236, "logits/rejected": -2.2100093364715576, "logps/chosen": -1.317688226699829, "logps/rejected": -16.148479461669922, "loss": 0.6024, "rewards/accuracies": 1.0, "rewards/chosen": 1.2664021253585815, "rewards/margins": 0.1906285285949707, "rewards/rejected": 1.0757735967636108, "step": 4923 }, { "epoch": 2.66, "learning_rate": 2.6804755344710823e-08, "logits/chosen": -2.144171714782715, "logits/rejected": -2.1374714374542236, "logps/chosen": -4.052964210510254, "logps/rejected": -4.733316898345947, "loss": 0.2795, "rewards/accuracies": 1.0, "rewards/chosen": 1.49204683303833, "rewards/margins": 1.131618857383728, "rewards/rejected": 0.36042800545692444, "step": 4924 }, { "epoch": 2.66, "learning_rate": 2.678541173644836e-08, "logits/chosen": -2.12259578704834, "logits/rejected": -2.115053653717041, "logps/chosen": -4.470021724700928, "logps/rejected": -5.9831624031066895, "loss": 0.3762, "rewards/accuracies": 1.0, "rewards/chosen": 1.1661490201950073, "rewards/margins": 0.7836951017379761, "rewards/rejected": 0.38245388865470886, "step": 4925 }, { "epoch": 2.66, "learning_rate": 2.6766072556522278e-08, "logits/chosen": -2.1303365230560303, "logits/rejected": -2.129457473754883, "logps/chosen": -4.183636665344238, "logps/rejected": -6.274075984954834, "loss": 0.3147, "rewards/accuracies": 1.0, "rewards/chosen": 1.400131344795227, "rewards/margins": 0.9946733713150024, "rewards/rejected": 0.4054579436779022, "step": 4926 }, { "epoch": 2.66, "learning_rate": 2.6746737808621667e-08, "logits/chosen": -2.0790512561798096, "logits/rejected": -2.083113431930542, "logps/chosen": -2.123107433319092, "logps/rejected": -0.705868661403656, "loss": 0.4773, "rewards/accuracies": 1.0, "rewards/chosen": 1.3466414213180542, "rewards/margins": 0.4915139079093933, "rewards/rejected": 0.8551275134086609, "step": 4927 }, { "epoch": 2.66, "learning_rate": 2.672740749643474e-08, "logits/chosen": -1.9797886610031128, "logits/rejected": -2.32875657081604, "logps/chosen": -0.1054084300994873, "logps/rejected": -0.12490668892860413, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9206835031509399, "rewards/margins": 0.017746388912200928, "rewards/rejected": 0.902937114238739, "step": 4928 }, { "epoch": 2.66, "learning_rate": 2.670808162364889e-08, "logits/chosen": -2.1154966354370117, "logits/rejected": -2.111321449279785, "logps/chosen": -0.9792667031288147, "logps/rejected": -9.137497901916504, "loss": 0.3, "rewards/accuracies": 1.0, "rewards/chosen": 1.265298843383789, "rewards/margins": 1.050324559211731, "rewards/rejected": 0.2149743139743805, "step": 4929 }, { "epoch": 2.66, "learning_rate": 2.6688760193950654e-08, "logits/chosen": -2.062720537185669, "logits/rejected": -2.1684534549713135, "logps/chosen": -2.354261875152588, "logps/rejected": -10.78069019317627, "loss": 0.4519, "rewards/accuracies": 1.0, "rewards/chosen": 1.333025336265564, "rewards/margins": 0.559712827205658, "rewards/rejected": 0.773312509059906, "step": 4930 }, { "epoch": 2.66, "learning_rate": 2.6669443211025733e-08, "logits/chosen": -2.1178817749023438, "logits/rejected": -2.118251323699951, "logps/chosen": -1.0470117330551147, "logps/rejected": -1.2875635623931885, "loss": 0.6037, "rewards/accuracies": 1.0, "rewards/chosen": 1.0173746347427368, "rewards/margins": 0.18779510259628296, "rewards/rejected": 0.8295795321464539, "step": 4931 }, { "epoch": 2.66, "learning_rate": 2.6650130678558956e-08, "logits/chosen": -2.225407838821411, "logits/rejected": -2.3451855182647705, "logps/chosen": -0.42311450839042664, "logps/rejected": -0.39071354269981384, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 1.0547339916229248, "rewards/margins": 0.015335917472839355, "rewards/rejected": 1.0393980741500854, "step": 4932 }, { "epoch": 2.66, "learning_rate": 2.6630822600234316e-08, "logits/chosen": -2.0721981525421143, "logits/rejected": -2.2792751789093018, "logps/chosen": -0.9582796692848206, "logps/rejected": -1.1006978750228882, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 1.0827871561050415, "rewards/margins": 0.028751611709594727, "rewards/rejected": 1.0540355443954468, "step": 4933 }, { "epoch": 2.66, "learning_rate": 2.661151897973497e-08, "logits/chosen": -2.0493597984313965, "logits/rejected": -2.0424423217773438, "logps/chosen": -5.14155387878418, "logps/rejected": -2.384272575378418, "loss": 0.4743, "rewards/accuracies": 1.0, "rewards/chosen": 1.2623285055160522, "rewards/margins": 0.4994511008262634, "rewards/rejected": 0.7628774046897888, "step": 4934 }, { "epoch": 2.66, "learning_rate": 2.6592219820743193e-08, "logits/chosen": -2.095116138458252, "logits/rejected": -2.3573200702667236, "logps/chosen": -7.644374370574951, "logps/rejected": -5.2519330978393555, "loss": 0.8956, "rewards/accuracies": 0.0, "rewards/chosen": 0.3618210256099701, "rewards/margins": -0.3708142936229706, "rewards/rejected": 0.7326353192329407, "step": 4935 }, { "epoch": 2.66, "learning_rate": 2.657292512694045e-08, "logits/chosen": -2.1767642498016357, "logits/rejected": -2.2854907512664795, "logps/chosen": -10.670951843261719, "logps/rejected": -3.4216742515563965, "loss": 1.096, "rewards/accuracies": 0.0, "rewards/chosen": 0.0809105858206749, "rewards/margins": -0.6892207264900208, "rewards/rejected": 0.7701312899589539, "step": 4936 }, { "epoch": 2.66, "learning_rate": 2.6553634902007304e-08, "logits/chosen": -2.067167043685913, "logits/rejected": -2.0754170417785645, "logps/chosen": -0.8692334890365601, "logps/rejected": -5.027155876159668, "loss": 0.423, "rewards/accuracies": 1.0, "rewards/chosen": 0.8884181380271912, "rewards/margins": 0.6414849162101746, "rewards/rejected": 0.2469332218170166, "step": 4937 }, { "epoch": 2.66, "learning_rate": 2.653434914962352e-08, "logits/chosen": -2.1896955966949463, "logits/rejected": -2.145658254623413, "logps/chosen": -18.510009765625, "logps/rejected": -6.071661472320557, "loss": 0.1796, "rewards/accuracies": 1.0, "rewards/chosen": 1.9851276874542236, "rewards/margins": 1.6259183883666992, "rewards/rejected": 0.3592092990875244, "step": 4938 }, { "epoch": 2.66, "learning_rate": 2.6515067873467966e-08, "logits/chosen": -2.0273072719573975, "logits/rejected": -2.027949810028076, "logps/chosen": -1.116431474685669, "logps/rejected": -3.326878786087036, "loss": 0.4957, "rewards/accuracies": 1.0, "rewards/chosen": 1.1163110733032227, "rewards/margins": 0.4437488317489624, "rewards/rejected": 0.6725622415542603, "step": 4939 }, { "epoch": 2.66, "learning_rate": 2.6495791077218677e-08, "logits/chosen": -2.0082967281341553, "logits/rejected": -2.253223180770874, "logps/chosen": -0.20021513104438782, "logps/rejected": -0.22169238328933716, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.8682644963264465, "rewards/margins": -0.0009484291076660156, "rewards/rejected": 0.8692129254341125, "step": 4940 }, { "epoch": 2.67, "learning_rate": 2.6476518764552834e-08, "logits/chosen": -2.099344253540039, "logits/rejected": -2.111539363861084, "logps/chosen": -5.708061695098877, "logps/rejected": -5.020534515380859, "loss": 0.3496, "rewards/accuracies": 1.0, "rewards/chosen": 1.7887243032455444, "rewards/margins": 0.8709926009178162, "rewards/rejected": 0.9177317023277283, "step": 4941 }, { "epoch": 2.67, "learning_rate": 2.645725093914673e-08, "logits/chosen": -2.084620952606201, "logits/rejected": -2.316577672958374, "logps/chosen": -0.5202229022979736, "logps/rejected": -0.557404100894928, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.7961146235466003, "rewards/margins": 0.016697704792022705, "rewards/rejected": 0.7794169187545776, "step": 4942 }, { "epoch": 2.67, "learning_rate": 2.6437987604675897e-08, "logits/chosen": -2.182504415512085, "logits/rejected": -2.179612636566162, "logps/chosen": -3.265338897705078, "logps/rejected": -4.108997821807861, "loss": 0.2944, "rewards/accuracies": 1.0, "rewards/chosen": 1.5658756494522095, "rewards/margins": 1.0720534324645996, "rewards/rejected": 0.49382224678993225, "step": 4943 }, { "epoch": 2.67, "learning_rate": 2.641872876481488e-08, "logits/chosen": -2.1105237007141113, "logits/rejected": -2.062246799468994, "logps/chosen": -7.989970684051514, "logps/rejected": -6.390984058380127, "loss": 0.3486, "rewards/accuracies": 1.0, "rewards/chosen": 1.6368722915649414, "rewards/margins": 0.8743109107017517, "rewards/rejected": 0.7625613808631897, "step": 4944 }, { "epoch": 2.67, "learning_rate": 2.6399474423237455e-08, "logits/chosen": -2.058394193649292, "logits/rejected": -2.045870065689087, "logps/chosen": -2.749903678894043, "logps/rejected": -2.2625486850738525, "loss": 0.3395, "rewards/accuracies": 1.0, "rewards/chosen": 1.7357845306396484, "rewards/margins": 0.9057726860046387, "rewards/rejected": 0.8300118446350098, "step": 4945 }, { "epoch": 2.67, "learning_rate": 2.638022458361651e-08, "logits/chosen": -2.1272971630096436, "logits/rejected": -2.3259663581848145, "logps/chosen": -3.316789388656616, "logps/rejected": -1.1977795362472534, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 1.11284339427948, "rewards/margins": -0.02185201644897461, "rewards/rejected": 1.1346954107284546, "step": 4946 }, { "epoch": 2.67, "learning_rate": 2.6360979249624084e-08, "logits/chosen": -2.0073676109313965, "logits/rejected": -2.2963624000549316, "logps/chosen": -0.11619248241186142, "logps/rejected": -0.12196698039770126, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.9120623469352722, "rewards/margins": -0.00043261051177978516, "rewards/rejected": 0.912494957447052, "step": 4947 }, { "epoch": 2.67, "learning_rate": 2.6341738424931347e-08, "logits/chosen": -2.1621627807617188, "logits/rejected": -2.169142484664917, "logps/chosen": -4.836447715759277, "logps/rejected": -2.6935606002807617, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 1.2587281465530396, "rewards/margins": 0.013134956359863281, "rewards/rejected": 1.2455931901931763, "step": 4948 }, { "epoch": 2.67, "learning_rate": 2.632250211320861e-08, "logits/chosen": -2.055410623550415, "logits/rejected": -2.3251075744628906, "logps/chosen": -2.2601473331451416, "logps/rejected": -2.0634829998016357, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.680241048336029, "rewards/margins": 0.00014698505401611328, "rewards/rejected": 0.6800940632820129, "step": 4949 }, { "epoch": 2.67, "learning_rate": 2.6303270318125336e-08, "logits/chosen": -2.0441324710845947, "logits/rejected": -2.244060754776001, "logps/chosen": -3.4437999725341797, "logps/rejected": -0.5522921085357666, "loss": 0.6164, "rewards/accuracies": 1.0, "rewards/chosen": 1.1685725450515747, "rewards/margins": 0.15979933738708496, "rewards/rejected": 1.0087732076644897, "step": 4950 }, { "epoch": 2.67, "learning_rate": 2.6284043043350117e-08, "logits/chosen": -2.0837864875793457, "logits/rejected": -2.0806169509887695, "logps/chosen": -3.437598943710327, "logps/rejected": -3.4341931343078613, "loss": 0.5474, "rewards/accuracies": 1.0, "rewards/chosen": 1.103567361831665, "rewards/margins": 0.31641989946365356, "rewards/rejected": 0.7871474623680115, "step": 4951 }, { "epoch": 2.67, "learning_rate": 2.6264820292550684e-08, "logits/chosen": -2.0071613788604736, "logits/rejected": -2.017751455307007, "logps/chosen": -0.9326386451721191, "logps/rejected": -5.941716194152832, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 1.5484554767608643, "rewards/margins": 0.9858235716819763, "rewards/rejected": 0.5626319050788879, "step": 4952 }, { "epoch": 2.67, "learning_rate": 2.6245602069393867e-08, "logits/chosen": -2.088773488998413, "logits/rejected": -2.2854039669036865, "logps/chosen": -2.5933279991149902, "logps/rejected": -3.5024192333221436, "loss": 0.666, "rewards/accuracies": 1.0, "rewards/chosen": 0.7993473410606384, "rewards/margins": 0.05503249168395996, "rewards/rejected": 0.7443148493766785, "step": 4953 }, { "epoch": 2.67, "learning_rate": 2.6226388377545726e-08, "logits/chosen": -2.102015972137451, "logits/rejected": -2.356266498565674, "logps/chosen": -7.903858184814453, "logps/rejected": -13.848519325256348, "loss": 0.7333, "rewards/accuracies": 0.0, "rewards/chosen": 0.8871833682060242, "rewards/margins": -0.07874119281768799, "rewards/rejected": 0.9659245610237122, "step": 4954 }, { "epoch": 2.67, "learning_rate": 2.620717922067138e-08, "logits/chosen": -2.021115779876709, "logits/rejected": -2.3277158737182617, "logps/chosen": -0.3263089060783386, "logps/rejected": -0.36411982774734497, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 1.0290688276290894, "rewards/margins": 0.006466984748840332, "rewards/rejected": 1.022601842880249, "step": 4955 }, { "epoch": 2.67, "learning_rate": 2.61879746024351e-08, "logits/chosen": -2.183044672012329, "logits/rejected": -2.3161399364471436, "logps/chosen": -4.538626194000244, "logps/rejected": -2.1041100025177, "loss": 0.7201, "rewards/accuracies": 0.0, "rewards/chosen": 0.8420244455337524, "rewards/margins": -0.05311572551727295, "rewards/rejected": 0.8951401710510254, "step": 4956 }, { "epoch": 2.67, "learning_rate": 2.6168774526500293e-08, "logits/chosen": -2.154568672180176, "logits/rejected": -2.276066541671753, "logps/chosen": -0.3407125174999237, "logps/rejected": -0.347062349319458, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.7970842719078064, "rewards/margins": 0.009037435054779053, "rewards/rejected": 0.7880468368530273, "step": 4957 }, { "epoch": 2.67, "learning_rate": 2.614957899652949e-08, "logits/chosen": -2.1020607948303223, "logits/rejected": -2.120243549346924, "logps/chosen": -1.6000525951385498, "logps/rejected": -7.3114013671875, "loss": 0.5135, "rewards/accuracies": 1.0, "rewards/chosen": 1.1289358139038086, "rewards/margins": 0.39878690242767334, "rewards/rejected": 0.7301489114761353, "step": 4958 }, { "epoch": 2.67, "learning_rate": 2.6130388016184385e-08, "logits/chosen": -1.9972805976867676, "logits/rejected": -1.9921362400054932, "logps/chosen": -3.0835683345794678, "logps/rejected": -8.83344841003418, "loss": 0.4503, "rewards/accuracies": 1.0, "rewards/chosen": 0.9823795557022095, "rewards/margins": 0.5641563534736633, "rewards/rejected": 0.41822320222854614, "step": 4959 }, { "epoch": 2.68, "learning_rate": 2.6111201589125787e-08, "logits/chosen": -2.2427358627319336, "logits/rejected": -2.2615976333618164, "logps/chosen": -1.936584234237671, "logps/rejected": -9.282047271728516, "loss": 0.3029, "rewards/accuracies": 1.0, "rewards/chosen": 1.4670085906982422, "rewards/margins": 1.0391864776611328, "rewards/rejected": 0.4278221130371094, "step": 4960 }, { "epoch": 2.68, "learning_rate": 2.6092019719013614e-08, "logits/chosen": -2.1399848461151123, "logits/rejected": -2.114398956298828, "logps/chosen": -12.877389907836914, "logps/rejected": -1.2108478546142578, "loss": 0.3327, "rewards/accuracies": 1.0, "rewards/chosen": 1.9052479267120361, "rewards/margins": 0.9296937584877014, "rewards/rejected": 0.9755541682243347, "step": 4961 }, { "epoch": 2.68, "learning_rate": 2.6072842409506933e-08, "logits/chosen": -2.087566375732422, "logits/rejected": -2.0932066440582275, "logps/chosen": -2.9031484127044678, "logps/rejected": -6.881613254547119, "loss": 0.4078, "rewards/accuracies": 1.0, "rewards/chosen": 1.0007251501083374, "rewards/margins": 0.6862856149673462, "rewards/rejected": 0.3144395351409912, "step": 4962 }, { "epoch": 2.68, "learning_rate": 2.6053669664263956e-08, "logits/chosen": -2.2489447593688965, "logits/rejected": -2.2447092533111572, "logps/chosen": -7.795783519744873, "logps/rejected": -7.627455711364746, "loss": 0.2363, "rewards/accuracies": 1.0, "rewards/chosen": 1.4325459003448486, "rewards/margins": 1.3221222162246704, "rewards/rejected": 0.11042366176843643, "step": 4963 }, { "epoch": 2.68, "learning_rate": 2.6034501486942007e-08, "logits/chosen": -2.2370975017547607, "logits/rejected": -2.109832525253296, "logps/chosen": -42.20062255859375, "logps/rejected": -10.580851554870605, "loss": 0.1709, "rewards/accuracies": 1.0, "rewards/chosen": 2.5735185146331787, "rewards/margins": 1.6801214218139648, "rewards/rejected": 0.8933970332145691, "step": 4964 }, { "epoch": 2.68, "learning_rate": 2.6015337881197525e-08, "logits/chosen": -1.9953824281692505, "logits/rejected": -2.3263607025146484, "logps/chosen": -0.3918859362602234, "logps/rejected": -0.34391123056411743, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.905695378780365, "rewards/margins": 0.0010266304016113281, "rewards/rejected": 0.9046687483787537, "step": 4965 }, { "epoch": 2.68, "learning_rate": 2.5996178850686135e-08, "logits/chosen": -2.1099190711975098, "logits/rejected": -2.258721351623535, "logps/chosen": -2.371103286743164, "logps/rejected": -7.276630401611328, "loss": 0.558, "rewards/accuracies": 1.0, "rewards/chosen": 0.7020456194877625, "rewards/margins": 0.2914212942123413, "rewards/rejected": 0.41062432527542114, "step": 4966 }, { "epoch": 2.68, "learning_rate": 2.597702439906252e-08, "logits/chosen": -2.112637996673584, "logits/rejected": -2.148946762084961, "logps/chosen": -4.224666118621826, "logps/rejected": -14.871198654174805, "loss": 0.4223, "rewards/accuracies": 1.0, "rewards/chosen": 1.223128318786621, "rewards/margins": 0.6435332894325256, "rewards/rejected": 0.5795950293540955, "step": 4967 }, { "epoch": 2.68, "learning_rate": 2.5957874529980527e-08, "logits/chosen": -2.1395421028137207, "logits/rejected": -2.1423935890197754, "logps/chosen": -0.55799400806427, "logps/rejected": -4.252035617828369, "loss": 0.4764, "rewards/accuracies": 1.0, "rewards/chosen": 1.1453273296356201, "rewards/margins": 0.49377745389938354, "rewards/rejected": 0.6515498757362366, "step": 4968 }, { "epoch": 2.68, "learning_rate": 2.593872924709312e-08, "logits/chosen": -2.1363842487335205, "logits/rejected": -2.299466609954834, "logps/chosen": -1.2804324626922607, "logps/rejected": -1.4026433229446411, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.8734295964241028, "rewards/margins": 0.031440138816833496, "rewards/rejected": 0.8419894576072693, "step": 4969 }, { "epoch": 2.68, "learning_rate": 2.591958855405239e-08, "logits/chosen": -2.1658973693847656, "logits/rejected": -2.2980237007141113, "logps/chosen": -4.827017307281494, "logps/rejected": -4.858750343322754, "loss": 0.6436, "rewards/accuracies": 1.0, "rewards/chosen": 0.6921519041061401, "rewards/margins": 0.1016005277633667, "rewards/rejected": 0.5905513763427734, "step": 4970 }, { "epoch": 2.68, "learning_rate": 2.5900452454509547e-08, "logits/chosen": -2.138479709625244, "logits/rejected": -2.145052909851074, "logps/chosen": -2.799755334854126, "logps/rejected": -5.105336666107178, "loss": 0.3472, "rewards/accuracies": 1.0, "rewards/chosen": 1.2617168426513672, "rewards/margins": 0.8791303634643555, "rewards/rejected": 0.38258644938468933, "step": 4971 }, { "epoch": 2.68, "learning_rate": 2.5881320952114926e-08, "logits/chosen": -2.0341551303863525, "logits/rejected": -2.0391247272491455, "logps/chosen": -4.98476505279541, "logps/rejected": -1.7065893411636353, "loss": 0.6, "rewards/accuracies": 1.0, "rewards/chosen": 1.0044220685958862, "rewards/margins": 0.1958727240562439, "rewards/rejected": 0.8085493445396423, "step": 4972 }, { "epoch": 2.68, "learning_rate": 2.5862194050517994e-08, "logits/chosen": -2.1219968795776367, "logits/rejected": -2.1305930614471436, "logps/chosen": -4.4641571044921875, "logps/rejected": -2.281654119491577, "loss": 0.1909, "rewards/accuracies": 1.0, "rewards/chosen": 2.191420793533325, "rewards/margins": 1.5588693618774414, "rewards/rejected": 0.6325514912605286, "step": 4973 }, { "epoch": 2.68, "learning_rate": 2.5843071753367334e-08, "logits/chosen": -2.17042875289917, "logits/rejected": -2.3480637073516846, "logps/chosen": -2.138414144515991, "logps/rejected": -2.1034011840820312, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.5994362831115723, "rewards/margins": 0.008447468280792236, "rewards/rejected": 0.59098881483078, "step": 4974 }, { "epoch": 2.68, "learning_rate": 2.5823954064310648e-08, "logits/chosen": -2.1071250438690186, "logits/rejected": -2.1057491302490234, "logps/chosen": -0.8919085264205933, "logps/rejected": -4.685297012329102, "loss": 0.4629, "rewards/accuracies": 1.0, "rewards/chosen": 1.0858932733535767, "rewards/margins": 0.5299004912376404, "rewards/rejected": 0.5559927821159363, "step": 4975 }, { "epoch": 2.68, "learning_rate": 2.580484098699476e-08, "logits/chosen": -2.0267112255096436, "logits/rejected": -2.283895969390869, "logps/chosen": -9.781492233276367, "logps/rejected": -11.552667617797852, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 1.1501051187515259, "rewards/margins": 0.016087889671325684, "rewards/rejected": 1.1340172290802002, "step": 4976 }, { "epoch": 2.68, "learning_rate": 2.5785732525065617e-08, "logits/chosen": -2.1321613788604736, "logits/rejected": -2.1644387245178223, "logps/chosen": -1.774883508682251, "logps/rejected": -6.06623649597168, "loss": 0.6156, "rewards/accuracies": 1.0, "rewards/chosen": 1.110064148902893, "rewards/margins": 0.1616399884223938, "rewards/rejected": 0.9484241604804993, "step": 4977 }, { "epoch": 2.69, "learning_rate": 2.5766628682168283e-08, "logits/chosen": -2.044391632080078, "logits/rejected": -2.327578067779541, "logps/chosen": -11.128939628601074, "logps/rejected": -6.82161283493042, "loss": 0.6405, "rewards/accuracies": 1.0, "rewards/chosen": 0.9691773653030396, "rewards/margins": 0.10817921161651611, "rewards/rejected": 0.8609981536865234, "step": 4978 }, { "epoch": 2.69, "learning_rate": 2.574752946194695e-08, "logits/chosen": -2.165846586227417, "logits/rejected": -2.1586570739746094, "logps/chosen": -4.569019794464111, "logps/rejected": -4.1180949211120605, "loss": 0.6405, "rewards/accuracies": 1.0, "rewards/chosen": 0.7705201506614685, "rewards/margins": 0.10822141170501709, "rewards/rejected": 0.6622987389564514, "step": 4979 }, { "epoch": 2.69, "learning_rate": 2.5728434868044913e-08, "logits/chosen": -1.9662628173828125, "logits/rejected": -2.2654011249542236, "logps/chosen": -3.0441982746124268, "logps/rejected": -3.391832113265991, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.7281505465507507, "rewards/margins": 0.0029116272926330566, "rewards/rejected": 0.7252389192581177, "step": 4980 }, { "epoch": 2.69, "learning_rate": 2.57093449041046e-08, "logits/chosen": -2.159301519393921, "logits/rejected": -2.0731890201568604, "logps/chosen": -22.722530364990234, "logps/rejected": -3.5974628925323486, "loss": 0.4115, "rewards/accuracies": 1.0, "rewards/chosen": 1.5932353734970093, "rewards/margins": 0.6751015186309814, "rewards/rejected": 0.9181338548660278, "step": 4981 }, { "epoch": 2.69, "learning_rate": 2.5690259573767536e-08, "logits/chosen": -2.0776569843292236, "logits/rejected": -2.070361852645874, "logps/chosen": -2.325310230255127, "logps/rejected": -6.278182029724121, "loss": 0.4261, "rewards/accuracies": 1.0, "rewards/chosen": 1.3142263889312744, "rewards/margins": 0.6324408650398254, "rewards/rejected": 0.681785523891449, "step": 4982 }, { "epoch": 2.69, "learning_rate": 2.567117888067439e-08, "logits/chosen": -2.089334011077881, "logits/rejected": -2.2821192741394043, "logps/chosen": -0.3062823414802551, "logps/rejected": -0.28298768401145935, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.9296413660049438, "rewards/margins": 0.015708446502685547, "rewards/rejected": 0.9139329195022583, "step": 4983 }, { "epoch": 2.69, "learning_rate": 2.5652102828464916e-08, "logits/chosen": -2.142211675643921, "logits/rejected": -2.142636775970459, "logps/chosen": -2.9786171913146973, "logps/rejected": -1.3702434301376343, "loss": 0.5262, "rewards/accuracies": 1.0, "rewards/chosen": 1.044842004776001, "rewards/margins": 0.36740434169769287, "rewards/rejected": 0.6774376630783081, "step": 4984 }, { "epoch": 2.69, "learning_rate": 2.5633031420778e-08, "logits/chosen": -2.1033267974853516, "logits/rejected": -2.115393877029419, "logps/chosen": -1.061465859413147, "logps/rejected": -5.733224868774414, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": 1.1728836297988892, "rewards/margins": 0.2855229377746582, "rewards/rejected": 0.887360692024231, "step": 4985 }, { "epoch": 2.69, "learning_rate": 2.561396466125164e-08, "logits/chosen": -2.145695447921753, "logits/rejected": -2.2906410694122314, "logps/chosen": -1.1023328304290771, "logps/rejected": -0.7389480471611023, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 1.0766572952270508, "rewards/margins": 0.017002105712890625, "rewards/rejected": 1.0596551895141602, "step": 4986 }, { "epoch": 2.69, "learning_rate": 2.559490255352294e-08, "logits/chosen": -2.1587302684783936, "logits/rejected": -2.2229323387145996, "logps/chosen": -6.7369184494018555, "logps/rejected": -5.2909626960754395, "loss": 0.718, "rewards/accuracies": 0.0, "rewards/chosen": 0.7667683959007263, "rewards/margins": -0.04915797710418701, "rewards/rejected": 0.8159263730049133, "step": 4987 }, { "epoch": 2.69, "learning_rate": 2.5575845101228144e-08, "logits/chosen": -2.094165563583374, "logits/rejected": -2.2634799480438232, "logps/chosen": -1.2409323453903198, "logps/rejected": -1.0812370777130127, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 1.0354126691818237, "rewards/margins": 0.01962721347808838, "rewards/rejected": 1.0157854557037354, "step": 4988 }, { "epoch": 2.69, "learning_rate": 2.555679230800258e-08, "logits/chosen": -2.1821799278259277, "logits/rejected": -2.104485273361206, "logps/chosen": -26.784568786621094, "logps/rejected": -3.490978240966797, "loss": 0.2577, "rewards/accuracies": 1.0, "rewards/chosen": 1.866658091545105, "rewards/margins": 1.224241018295288, "rewards/rejected": 0.6424171328544617, "step": 4989 }, { "epoch": 2.69, "learning_rate": 2.5537744177480704e-08, "logits/chosen": -2.0844106674194336, "logits/rejected": -2.1134414672851562, "logps/chosen": -1.7077549695968628, "logps/rejected": -8.600553512573242, "loss": 0.2957, "rewards/accuracies": 1.0, "rewards/chosen": 1.7104040384292603, "rewards/margins": 1.066924810409546, "rewards/rejected": 0.6434791684150696, "step": 4990 }, { "epoch": 2.69, "learning_rate": 2.551870071329604e-08, "logits/chosen": -2.090038776397705, "logits/rejected": -2.0818116664886475, "logps/chosen": -11.404923439025879, "logps/rejected": -6.195176124572754, "loss": 0.4312, "rewards/accuracies": 1.0, "rewards/chosen": 1.2727431058883667, "rewards/margins": 0.6177707314491272, "rewards/rejected": 0.6549723744392395, "step": 4991 }, { "epoch": 2.69, "learning_rate": 2.5499661919081273e-08, "logits/chosen": -1.9935861825942993, "logits/rejected": -1.9950778484344482, "logps/chosen": -1.8493729829788208, "logps/rejected": -3.6823019981384277, "loss": 0.4899, "rewards/accuracies": 1.0, "rewards/chosen": 0.9632980227470398, "rewards/margins": 0.45868420600891113, "rewards/rejected": 0.5046138167381287, "step": 4992 }, { "epoch": 2.69, "learning_rate": 2.548062779846818e-08, "logits/chosen": -2.0948991775512695, "logits/rejected": -2.0655205249786377, "logps/chosen": -5.031448841094971, "logps/rejected": -4.920228004455566, "loss": 0.3941, "rewards/accuracies": 1.0, "rewards/chosen": 1.2952884435653687, "rewards/margins": 0.7277359366416931, "rewards/rejected": 0.5675525069236755, "step": 4993 }, { "epoch": 2.69, "learning_rate": 2.5461598355087644e-08, "logits/chosen": -2.079799175262451, "logits/rejected": -2.065614938735962, "logps/chosen": -12.129840850830078, "logps/rejected": -4.361169338226318, "loss": 0.2992, "rewards/accuracies": 1.0, "rewards/chosen": 1.4766597747802734, "rewards/margins": 1.053208351135254, "rewards/rejected": 0.4234514832496643, "step": 4994 }, { "epoch": 2.69, "learning_rate": 2.544257359256966e-08, "logits/chosen": -2.1270744800567627, "logits/rejected": -2.3395493030548096, "logps/chosen": -0.759904146194458, "logps/rejected": -16.791826248168945, "loss": 0.5542, "rewards/accuracies": 1.0, "rewards/chosen": 1.2427200078964233, "rewards/margins": 0.30048060417175293, "rewards/rejected": 0.9422394037246704, "step": 4995 }, { "epoch": 2.69, "learning_rate": 2.542355351454333e-08, "logits/chosen": -2.124729633331299, "logits/rejected": -2.1203198432922363, "logps/chosen": -2.467536211013794, "logps/rejected": -2.5280425548553467, "loss": 0.3147, "rewards/accuracies": 1.0, "rewards/chosen": 1.6040581464767456, "rewards/margins": 0.9945070743560791, "rewards/rejected": 0.6095510721206665, "step": 4996 }, { "epoch": 2.7, "learning_rate": 2.5404538124636848e-08, "logits/chosen": -1.9942163228988647, "logits/rejected": -2.2809739112854004, "logps/chosen": -0.8819890022277832, "logps/rejected": -0.928736686706543, "loss": 0.7003, "rewards/accuracies": 0.0, "rewards/chosen": 0.8599479794502258, "rewards/margins": -0.014323115348815918, "rewards/rejected": 0.8742710947990417, "step": 4997 }, { "epoch": 2.7, "learning_rate": 2.5385527426477517e-08, "logits/chosen": -2.031599998474121, "logits/rejected": -2.037942409515381, "logps/chosen": -1.5644819736480713, "logps/rejected": -5.987911701202393, "loss": 0.3782, "rewards/accuracies": 1.0, "rewards/chosen": 1.0134408473968506, "rewards/margins": 0.7772123217582703, "rewards/rejected": 0.23622851073741913, "step": 4998 }, { "epoch": 2.7, "learning_rate": 2.536652142369179e-08, "logits/chosen": -2.0697391033172607, "logits/rejected": -2.0691111087799072, "logps/chosen": -2.838251829147339, "logps/rejected": -6.477967262268066, "loss": 0.2306, "rewards/accuracies": 1.0, "rewards/chosen": 1.5907139778137207, "rewards/margins": 1.3493413925170898, "rewards/rejected": 0.24137258529663086, "step": 4999 }, { "epoch": 2.7, "learning_rate": 2.5347520119905162e-08, "logits/chosen": -2.0423977375030518, "logits/rejected": -2.0446383953094482, "logps/chosen": -0.12053985893726349, "logps/rejected": -7.228383541107178, "loss": 0.4411, "rewards/accuracies": 1.0, "rewards/chosen": 0.7723554968833923, "rewards/margins": 0.5899276733398438, "rewards/rejected": 0.18242783844470978, "step": 5000 }, { "epoch": 2.7, "learning_rate": 2.5328523518742268e-08, "logits/chosen": -2.032489538192749, "logits/rejected": -2.0343940258026123, "logps/chosen": -1.3487532138824463, "logps/rejected": -3.5326201915740967, "loss": 0.5007, "rewards/accuracies": 1.0, "rewards/chosen": 0.9444228410720825, "rewards/margins": 0.4309709668159485, "rewards/rejected": 0.513451874256134, "step": 5001 }, { "epoch": 2.7, "learning_rate": 2.530953162382683e-08, "logits/chosen": -2.204721450805664, "logits/rejected": -2.0973215103149414, "logps/chosen": -22.87103843688965, "logps/rejected": -4.139798164367676, "loss": 0.1167, "rewards/accuracies": 1.0, "rewards/chosen": 2.4942126274108887, "rewards/margins": 2.0888192653656006, "rewards/rejected": 0.4053933322429657, "step": 5002 }, { "epoch": 2.7, "learning_rate": 2.5290544438781673e-08, "logits/chosen": -2.0978546142578125, "logits/rejected": -2.089378595352173, "logps/chosen": -7.163996696472168, "logps/rejected": -6.770849704742432, "loss": 0.2531, "rewards/accuracies": 1.0, "rewards/chosen": 1.6563560962677002, "rewards/margins": 1.2448067665100098, "rewards/rejected": 0.41154932975769043, "step": 5003 }, { "epoch": 2.7, "learning_rate": 2.5271561967228738e-08, "logits/chosen": -2.248507022857666, "logits/rejected": -2.1107676029205322, "logps/chosen": -29.780223846435547, "logps/rejected": -1.395338535308838, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 3.2676846981048584, "rewards/margins": 2.557593822479248, "rewards/rejected": 0.7100909948348999, "step": 5004 }, { "epoch": 2.7, "learning_rate": 2.5252584212789054e-08, "logits/chosen": -2.023808717727661, "logits/rejected": -2.036397695541382, "logps/chosen": -26.700336456298828, "logps/rejected": -19.256271362304688, "loss": 0.4587, "rewards/accuracies": 1.0, "rewards/chosen": 1.1214821338653564, "rewards/margins": 0.5412474274635315, "rewards/rejected": 0.580234706401825, "step": 5005 }, { "epoch": 2.7, "learning_rate": 2.5233611179082748e-08, "logits/chosen": -2.0407817363739014, "logits/rejected": -2.0366761684417725, "logps/chosen": -3.211571216583252, "logps/rejected": -3.6510467529296875, "loss": 0.3081, "rewards/accuracies": 1.0, "rewards/chosen": 1.6573762893676758, "rewards/margins": 1.0191981792449951, "rewards/rejected": 0.6381781697273254, "step": 5006 }, { "epoch": 2.7, "learning_rate": 2.521464286972907e-08, "logits/chosen": -1.9551708698272705, "logits/rejected": -2.248870849609375, "logps/chosen": -0.6057404279708862, "logps/rejected": -0.6592769622802734, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8330095410346985, "rewards/margins": 0.021177470684051514, "rewards/rejected": 0.811832070350647, "step": 5007 }, { "epoch": 2.7, "learning_rate": 2.5195679288346324e-08, "logits/chosen": -2.0812716484069824, "logits/rejected": -2.3275322914123535, "logps/chosen": -0.9530253410339355, "logps/rejected": -4.634591102600098, "loss": 0.5579, "rewards/accuracies": 1.0, "rewards/chosen": 0.8524219393730164, "rewards/margins": 0.2917490005493164, "rewards/rejected": 0.5606729388237, "step": 5008 }, { "epoch": 2.7, "learning_rate": 2.5176720438551925e-08, "logits/chosen": -2.1831705570220947, "logits/rejected": -2.1979477405548096, "logps/chosen": -9.709112167358398, "logps/rejected": -10.955818176269531, "loss": 0.245, "rewards/accuracies": 1.0, "rewards/chosen": 2.0034587383270264, "rewards/margins": 1.2814373970031738, "rewards/rejected": 0.7220212817192078, "step": 5009 }, { "epoch": 2.7, "learning_rate": 2.515776632396245e-08, "logits/chosen": -2.070707082748413, "logits/rejected": -2.0755560398101807, "logps/chosen": -2.9319851398468018, "logps/rejected": -5.435877323150635, "loss": 0.3385, "rewards/accuracies": 1.0, "rewards/chosen": 1.3606692552566528, "rewards/margins": 0.9092516899108887, "rewards/rejected": 0.45141759514808655, "step": 5010 }, { "epoch": 2.7, "learning_rate": 2.5138816948193492e-08, "logits/chosen": -2.1581313610076904, "logits/rejected": -2.3067941665649414, "logps/chosen": -2.9863033294677734, "logps/rejected": -0.4590395987033844, "loss": 0.6693, "rewards/accuracies": 1.0, "rewards/chosen": 1.025971531867981, "rewards/margins": 0.04817932844161987, "rewards/rejected": 0.9777922034263611, "step": 5011 }, { "epoch": 2.7, "learning_rate": 2.5119872314859774e-08, "logits/chosen": -2.0523645877838135, "logits/rejected": -2.0555694103240967, "logps/chosen": -2.0143685340881348, "logps/rejected": -0.2788454592227936, "loss": 0.6532, "rewards/accuracies": 1.0, "rewards/chosen": 0.8362129330635071, "rewards/margins": 0.08147495985031128, "rewards/rejected": 0.7547379732131958, "step": 5012 }, { "epoch": 2.7, "learning_rate": 2.5100932427575117e-08, "logits/chosen": -2.1657094955444336, "logits/rejected": -2.127373695373535, "logps/chosen": -19.606239318847656, "logps/rejected": -3.3800387382507324, "loss": 0.1936, "rewards/accuracies": 1.0, "rewards/chosen": 2.1322021484375, "rewards/margins": 1.5433425903320312, "rewards/rejected": 0.5888596177101135, "step": 5013 }, { "epoch": 2.7, "learning_rate": 2.5081997289952416e-08, "logits/chosen": -2.0249311923980713, "logits/rejected": -2.2587220668792725, "logps/chosen": -0.3473937511444092, "logps/rejected": -4.958492279052734, "loss": 0.6238, "rewards/accuracies": 1.0, "rewards/chosen": 0.8325456976890564, "rewards/margins": 0.14386332035064697, "rewards/rejected": 0.6886823773384094, "step": 5014 }, { "epoch": 2.7, "learning_rate": 2.506306690560368e-08, "logits/chosen": -2.061276435852051, "logits/rejected": -2.2624640464782715, "logps/chosen": -0.9805986285209656, "logps/rejected": -0.9656267166137695, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8878308534622192, "rewards/margins": 0.009715676307678223, "rewards/rejected": 0.878115177154541, "step": 5015 }, { "epoch": 2.71, "learning_rate": 2.504414127814001e-08, "logits/chosen": -2.2279210090637207, "logits/rejected": -2.2295167446136475, "logps/chosen": -0.4280443787574768, "logps/rejected": -4.769611835479736, "loss": 0.4173, "rewards/accuracies": 1.0, "rewards/chosen": 0.9953336119651794, "rewards/margins": 0.6579462289810181, "rewards/rejected": 0.33738741278648376, "step": 5016 }, { "epoch": 2.71, "learning_rate": 2.5025220411171595e-08, "logits/chosen": -2.0469844341278076, "logits/rejected": -2.246556520462036, "logps/chosen": -2.918353796005249, "logps/rejected": -3.234781265258789, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.7701027989387512, "rewards/margins": 0.03271591663360596, "rewards/rejected": 0.7373868823051453, "step": 5017 }, { "epoch": 2.71, "learning_rate": 2.5006304308307708e-08, "logits/chosen": -2.1147689819335938, "logits/rejected": -2.1127662658691406, "logps/chosen": -1.6105384826660156, "logps/rejected": -9.98502254486084, "loss": 0.345, "rewards/accuracies": 1.0, "rewards/chosen": 0.9616096615791321, "rewards/margins": 0.8867581486701965, "rewards/rejected": 0.07485151290893555, "step": 5018 }, { "epoch": 2.71, "learning_rate": 2.4987392973156722e-08, "logits/chosen": -2.0165302753448486, "logits/rejected": -2.2547647953033447, "logps/chosen": -1.9670308828353882, "logps/rejected": -1.9673523902893066, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 1.2750929594039917, "rewards/margins": 0.02023780345916748, "rewards/rejected": 1.2548551559448242, "step": 5019 }, { "epoch": 2.71, "learning_rate": 2.4968486409326106e-08, "logits/chosen": -2.0728044509887695, "logits/rejected": -2.0806519985198975, "logps/chosen": -2.9066121578216553, "logps/rejected": -6.08577823638916, "loss": 0.3608, "rewards/accuracies": 1.0, "rewards/chosen": 1.7048338651657104, "rewards/margins": 0.833540678024292, "rewards/rejected": 0.8712931871414185, "step": 5020 }, { "epoch": 2.71, "learning_rate": 2.4949584620422403e-08, "logits/chosen": -2.085210084915161, "logits/rejected": -2.2609379291534424, "logps/chosen": -7.439651966094971, "logps/rejected": -1.0443261861801147, "loss": 0.9154, "rewards/accuracies": 0.0, "rewards/chosen": 0.7109947800636292, "rewards/margins": -0.40402477979660034, "rewards/rejected": 1.1150195598602295, "step": 5021 }, { "epoch": 2.71, "learning_rate": 2.4930687610051267e-08, "logits/chosen": -2.0555460453033447, "logits/rejected": -2.042835235595703, "logps/chosen": -4.792588233947754, "logps/rejected": -3.6973071098327637, "loss": 0.5058, "rewards/accuracies": 1.0, "rewards/chosen": 0.8756284117698669, "rewards/margins": 0.41811978816986084, "rewards/rejected": 0.4575086236000061, "step": 5022 }, { "epoch": 2.71, "learning_rate": 2.4911795381817415e-08, "logits/chosen": -2.028486490249634, "logits/rejected": -2.302199125289917, "logps/chosen": -5.675929546356201, "logps/rejected": -1.8644312620162964, "loss": 0.7623, "rewards/accuracies": 0.0, "rewards/chosen": 0.8782045245170593, "rewards/margins": -0.13386434316635132, "rewards/rejected": 1.0120688676834106, "step": 5023 }, { "epoch": 2.71, "learning_rate": 2.489290793932467e-08, "logits/chosen": -1.9477821588516235, "logits/rejected": -1.9281977415084839, "logps/chosen": -7.501984596252441, "logps/rejected": -0.8115469217300415, "loss": 0.5576, "rewards/accuracies": 1.0, "rewards/chosen": 1.3935046195983887, "rewards/margins": 0.29247522354125977, "rewards/rejected": 1.101029396057129, "step": 5024 }, { "epoch": 2.71, "learning_rate": 2.4874025286175937e-08, "logits/chosen": -2.0603981018066406, "logits/rejected": -2.0695109367370605, "logps/chosen": -0.8601369857788086, "logps/rejected": -3.33827805519104, "loss": 0.4851, "rewards/accuracies": 1.0, "rewards/chosen": 1.0854448080062866, "rewards/margins": 0.47097986936569214, "rewards/rejected": 0.6144649386405945, "step": 5025 }, { "epoch": 2.71, "learning_rate": 2.485514742597321e-08, "logits/chosen": -2.0273025035858154, "logits/rejected": -2.0275444984436035, "logps/chosen": -0.42260146141052246, "logps/rejected": -5.595334529876709, "loss": 0.4512, "rewards/accuracies": 1.0, "rewards/chosen": 0.8463045954704285, "rewards/margins": 0.5617877244949341, "rewards/rejected": 0.2845168709754944, "step": 5026 }, { "epoch": 2.71, "learning_rate": 2.4836274362317562e-08, "logits/chosen": -2.153212070465088, "logits/rejected": -2.15309476852417, "logps/chosen": -0.7748256325721741, "logps/rejected": -2.1959569454193115, "loss": 0.5717, "rewards/accuracies": 1.0, "rewards/chosen": 1.0933748483657837, "rewards/margins": 0.25963592529296875, "rewards/rejected": 0.8337389230728149, "step": 5027 }, { "epoch": 2.71, "learning_rate": 2.4817406098809153e-08, "logits/chosen": -2.052379846572876, "logits/rejected": -2.3359856605529785, "logps/chosen": -2.0328104496002197, "logps/rejected": -2.1495306491851807, "loss": 0.6636, "rewards/accuracies": 1.0, "rewards/chosen": 0.898974597454071, "rewards/margins": 0.060033202171325684, "rewards/rejected": 0.8389413952827454, "step": 5028 }, { "epoch": 2.71, "learning_rate": 2.4798542639047232e-08, "logits/chosen": -2.079965353012085, "logits/rejected": -2.3670623302459717, "logps/chosen": -0.8119465708732605, "logps/rejected": -0.8983203172683716, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.7782227396965027, "rewards/margins": 0.03148162364959717, "rewards/rejected": 0.7467411160469055, "step": 5029 }, { "epoch": 2.71, "learning_rate": 2.4779683986630124e-08, "logits/chosen": -2.0935370922088623, "logits/rejected": -2.1185672283172607, "logps/chosen": -3.1037678718566895, "logps/rejected": -6.236108779907227, "loss": 0.4682, "rewards/accuracies": 1.0, "rewards/chosen": 1.1839717626571655, "rewards/margins": 0.5156951546669006, "rewards/rejected": 0.6682766079902649, "step": 5030 }, { "epoch": 2.71, "learning_rate": 2.4760830145155253e-08, "logits/chosen": -2.149064302444458, "logits/rejected": -2.3372325897216797, "logps/chosen": -5.662109851837158, "logps/rejected": -5.56187105178833, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.6101462841033936, "rewards/margins": 0.014258205890655518, "rewards/rejected": 0.595888078212738, "step": 5031 }, { "epoch": 2.71, "learning_rate": 2.4741981118219086e-08, "logits/chosen": -2.054922342300415, "logits/rejected": -2.310821294784546, "logps/chosen": -1.0868971347808838, "logps/rejected": -1.089325189590454, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.9619799852371216, "rewards/margins": 0.011786282062530518, "rewards/rejected": 0.9501937031745911, "step": 5032 }, { "epoch": 2.71, "learning_rate": 2.4723136909417238e-08, "logits/chosen": -2.1550567150115967, "logits/rejected": -2.146868944168091, "logps/chosen": -2.744255781173706, "logps/rejected": -3.4981255531311035, "loss": 0.3874, "rewards/accuracies": 1.0, "rewards/chosen": 1.213820219039917, "rewards/margins": 0.7484690546989441, "rewards/rejected": 0.4653511643409729, "step": 5033 }, { "epoch": 2.72, "learning_rate": 2.4704297522344347e-08, "logits/chosen": -2.0151515007019043, "logits/rejected": -2.0174458026885986, "logps/chosen": -0.9631449580192566, "logps/rejected": -3.580965280532837, "loss": 0.4471, "rewards/accuracies": 1.0, "rewards/chosen": 1.1604559421539307, "rewards/margins": 0.5732197165489197, "rewards/rejected": 0.587236225605011, "step": 5034 }, { "epoch": 2.72, "learning_rate": 2.4685462960594156e-08, "logits/chosen": -1.9476701021194458, "logits/rejected": -1.9563461542129517, "logps/chosen": -1.2027461528778076, "logps/rejected": -3.723313331604004, "loss": 0.4733, "rewards/accuracies": 1.0, "rewards/chosen": 1.0487754344940186, "rewards/margins": 0.5021019577980042, "rewards/rejected": 0.5466734766960144, "step": 5035 }, { "epoch": 2.72, "learning_rate": 2.4666633227759504e-08, "logits/chosen": -2.1439287662506104, "logits/rejected": -2.150317907333374, "logps/chosen": -1.3990768194198608, "logps/rejected": -4.345587730407715, "loss": 0.4446, "rewards/accuracies": 1.0, "rewards/chosen": 0.9744974970817566, "rewards/margins": 0.5801741480827332, "rewards/rejected": 0.39432334899902344, "step": 5036 }, { "epoch": 2.72, "learning_rate": 2.4647808327432234e-08, "logits/chosen": -1.9957752227783203, "logits/rejected": -2.2610671520233154, "logps/chosen": -2.411468029022217, "logps/rejected": -2.299701690673828, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 1.006351351737976, "rewards/margins": 0.016302168369293213, "rewards/rejected": 0.9900491833686829, "step": 5037 }, { "epoch": 2.72, "learning_rate": 2.4628988263203353e-08, "logits/chosen": -2.135423421859741, "logits/rejected": -2.136948347091675, "logps/chosen": -0.19632330536842346, "logps/rejected": -6.815354347229004, "loss": 0.4032, "rewards/accuracies": 1.0, "rewards/chosen": 0.9301514029502869, "rewards/margins": 0.6999279260635376, "rewards/rejected": 0.23022346198558807, "step": 5038 }, { "epoch": 2.72, "learning_rate": 2.4610173038662914e-08, "logits/chosen": -2.137334108352661, "logits/rejected": -2.1491260528564453, "logps/chosen": -2.8079938888549805, "logps/rejected": -11.208797454833984, "loss": 0.5822, "rewards/accuracies": 1.0, "rewards/chosen": 1.2589879035949707, "rewards/margins": 0.23581278324127197, "rewards/rejected": 1.0231751203536987, "step": 5039 }, { "epoch": 2.72, "learning_rate": 2.4591362657400033e-08, "logits/chosen": -1.9831020832061768, "logits/rejected": -2.2892653942108154, "logps/chosen": -0.3608735501766205, "logps/rejected": -0.26570409536361694, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.8857281804084778, "rewards/margins": 0.01748257875442505, "rewards/rejected": 0.8682456016540527, "step": 5040 }, { "epoch": 2.72, "learning_rate": 2.4572557123002924e-08, "logits/chosen": -2.2300124168395996, "logits/rejected": -2.2132132053375244, "logps/chosen": -10.56468391418457, "logps/rejected": -6.378630638122559, "loss": 0.5267, "rewards/accuracies": 1.0, "rewards/chosen": 1.242975115776062, "rewards/margins": 0.36630386114120483, "rewards/rejected": 0.8766712546348572, "step": 5041 }, { "epoch": 2.72, "learning_rate": 2.455375643905887e-08, "logits/chosen": -2.1968278884887695, "logits/rejected": -2.1934874057769775, "logps/chosen": -6.77426815032959, "logps/rejected": -5.468247413635254, "loss": 0.3547, "rewards/accuracies": 1.0, "rewards/chosen": 1.259257197380066, "rewards/margins": 0.8538506031036377, "rewards/rejected": 0.40540656447410583, "step": 5042 }, { "epoch": 2.72, "learning_rate": 2.4534960609154198e-08, "logits/chosen": -2.0088651180267334, "logits/rejected": -2.352783203125, "logps/chosen": -0.2643241286277771, "logps/rejected": -0.28904056549072266, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.8796277046203613, "rewards/margins": 0.006556868553161621, "rewards/rejected": 0.8730708360671997, "step": 5043 }, { "epoch": 2.72, "learning_rate": 2.451616963687438e-08, "logits/chosen": -1.9869650602340698, "logits/rejected": -1.9903764724731445, "logps/chosen": -0.14035694301128387, "logps/rejected": -8.765475273132324, "loss": 0.405, "rewards/accuracies": 1.0, "rewards/chosen": 0.8388518691062927, "rewards/margins": 0.6946800351142883, "rewards/rejected": 0.1441718190908432, "step": 5044 }, { "epoch": 2.72, "learning_rate": 2.4497383525803898e-08, "logits/chosen": -2.102196216583252, "logits/rejected": -2.2601869106292725, "logps/chosen": -0.2903987169265747, "logps/rejected": -0.26880958676338196, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.9663301706314087, "rewards/margins": 0.006185948848724365, "rewards/rejected": 0.9601442217826843, "step": 5045 }, { "epoch": 2.72, "learning_rate": 2.4478602279526328e-08, "logits/chosen": -2.019038677215576, "logits/rejected": -2.016383409500122, "logps/chosen": -1.1183274984359741, "logps/rejected": -4.693351745605469, "loss": 0.4326, "rewards/accuracies": 1.0, "rewards/chosen": 1.1381268501281738, "rewards/margins": 0.6138733625411987, "rewards/rejected": 0.5242534875869751, "step": 5046 }, { "epoch": 2.72, "learning_rate": 2.445982590162432e-08, "logits/chosen": -2.0642971992492676, "logits/rejected": -2.0692195892333984, "logps/chosen": -6.693309307098389, "logps/rejected": -8.794824600219727, "loss": 0.2466, "rewards/accuracies": 1.0, "rewards/chosen": 1.8320194482803345, "rewards/margins": 1.2742960453033447, "rewards/rejected": 0.557723343372345, "step": 5047 }, { "epoch": 2.72, "learning_rate": 2.4441054395679584e-08, "logits/chosen": -2.0569570064544678, "logits/rejected": -2.1263821125030518, "logps/chosen": -2.475480794906616, "logps/rejected": -26.231035232543945, "loss": 0.15, "rewards/accuracies": 1.0, "rewards/chosen": 1.6879323720932007, "rewards/margins": 1.821476697921753, "rewards/rejected": -0.13354435563087463, "step": 5048 }, { "epoch": 2.72, "learning_rate": 2.4422287765272914e-08, "logits/chosen": -2.008877992630005, "logits/rejected": -2.2527668476104736, "logps/chosen": -1.701916217803955, "logps/rejected": -1.698451042175293, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.6659687757492065, "rewards/margins": 0.01947683095932007, "rewards/rejected": 0.6464919447898865, "step": 5049 }, { "epoch": 2.72, "learning_rate": 2.4403526013984176e-08, "logits/chosen": -2.091308116912842, "logits/rejected": -2.0969204902648926, "logps/chosen": -2.4883859157562256, "logps/rejected": -4.667409420013428, "loss": 0.4894, "rewards/accuracies": 1.0, "rewards/chosen": 1.0417510271072388, "rewards/margins": 0.45981669425964355, "rewards/rejected": 0.5819343328475952, "step": 5050 }, { "epoch": 2.72, "learning_rate": 2.4384769145392288e-08, "logits/chosen": -1.9661685228347778, "logits/rejected": -2.2860629558563232, "logps/chosen": -0.18702208995819092, "logps/rejected": -0.20518368482589722, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 1.0560675859451294, "rewards/margins": -0.0018035173416137695, "rewards/rejected": 1.0578711032867432, "step": 5051 }, { "epoch": 2.72, "learning_rate": 2.4366017163075247e-08, "logits/chosen": -2.0190999507904053, "logits/rejected": -2.0291748046875, "logps/chosen": -1.4629805088043213, "logps/rejected": -3.2563273906707764, "loss": 0.526, "rewards/accuracies": 1.0, "rewards/chosen": 1.0269393920898438, "rewards/margins": 0.36788129806518555, "rewards/rejected": 0.6590580940246582, "step": 5052 }, { "epoch": 2.73, "learning_rate": 2.4347270070610122e-08, "logits/chosen": -2.1828482151031494, "logits/rejected": -2.1882143020629883, "logps/chosen": -1.8799773454666138, "logps/rejected": -4.225984573364258, "loss": 0.4749, "rewards/accuracies": 1.0, "rewards/chosen": 0.9481436610221863, "rewards/margins": 0.49782294034957886, "rewards/rejected": 0.4503207206726074, "step": 5053 }, { "epoch": 2.73, "learning_rate": 2.4328527871573047e-08, "logits/chosen": -2.036156415939331, "logits/rejected": -2.3414533138275146, "logps/chosen": -5.804940223693848, "logps/rejected": -5.019713401794434, "loss": 0.7114, "rewards/accuracies": 0.0, "rewards/chosen": 0.6703974604606628, "rewards/margins": -0.03616201877593994, "rewards/rejected": 0.7065594792366028, "step": 5054 }, { "epoch": 2.73, "learning_rate": 2.430979056953922e-08, "logits/chosen": -2.0468180179595947, "logits/rejected": -2.0375218391418457, "logps/chosen": -6.112154483795166, "logps/rejected": -4.861518383026123, "loss": 0.4636, "rewards/accuracies": 1.0, "rewards/chosen": 1.1335138082504272, "rewards/margins": 0.5280747413635254, "rewards/rejected": 0.6054390668869019, "step": 5055 }, { "epoch": 2.73, "learning_rate": 2.4291058168082896e-08, "logits/chosen": -2.047351598739624, "logits/rejected": -2.045532464981079, "logps/chosen": -0.49445196986198425, "logps/rejected": -1.796867847442627, "loss": 0.5811, "rewards/accuracies": 1.0, "rewards/chosen": 1.023807168006897, "rewards/margins": 0.23826813697814941, "rewards/rejected": 0.7855390310287476, "step": 5056 }, { "epoch": 2.73, "learning_rate": 2.4272330670777415e-08, "logits/chosen": -2.0908679962158203, "logits/rejected": -2.0778000354766846, "logps/chosen": -10.532999038696289, "logps/rejected": -4.691189289093018, "loss": 0.2597, "rewards/accuracies": 1.0, "rewards/chosen": 1.6743913888931274, "rewards/margins": 1.2155698537826538, "rewards/rejected": 0.45882153511047363, "step": 5057 }, { "epoch": 2.73, "learning_rate": 2.4253608081195166e-08, "logits/chosen": -1.9796435832977295, "logits/rejected": -1.9831420183181763, "logps/chosen": -2.8303232192993164, "logps/rejected": -5.418199062347412, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 1.4076653718948364, "rewards/margins": 0.6527583003044128, "rewards/rejected": 0.7549070715904236, "step": 5058 }, { "epoch": 2.73, "learning_rate": 2.423489040290761e-08, "logits/chosen": -2.2630743980407715, "logits/rejected": -2.338418960571289, "logps/chosen": -0.9545767307281494, "logps/rejected": -1.062477946281433, "loss": 0.6968, "rewards/accuracies": 0.0, "rewards/chosen": 1.059688925743103, "rewards/margins": -0.00736236572265625, "rewards/rejected": 1.0670512914657593, "step": 5059 }, { "epoch": 2.73, "learning_rate": 2.4216177639485265e-08, "logits/chosen": -1.9981197118759155, "logits/rejected": -2.255718469619751, "logps/chosen": -0.6803346872329712, "logps/rejected": -0.705917477607727, "loss": 0.6998, "rewards/accuracies": 0.0, "rewards/chosen": 0.7813124060630798, "rewards/margins": -0.013225317001342773, "rewards/rejected": 0.7945377230644226, "step": 5060 }, { "epoch": 2.73, "learning_rate": 2.4197469794497716e-08, "logits/chosen": -1.978377342224121, "logits/rejected": -2.2601795196533203, "logps/chosen": -0.13565769791603088, "logps/rejected": -0.12926974892616272, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.9520965814590454, "rewards/margins": 0.012636780738830566, "rewards/rejected": 0.9394598007202148, "step": 5061 }, { "epoch": 2.73, "learning_rate": 2.4178766871513605e-08, "logits/chosen": -2.048130989074707, "logits/rejected": -2.064230442047119, "logps/chosen": -1.6167662143707275, "logps/rejected": -11.48634147644043, "loss": 0.5114, "rewards/accuracies": 1.0, "rewards/chosen": 1.210049033164978, "rewards/margins": 0.4039968252182007, "rewards/rejected": 0.8060522079467773, "step": 5062 }, { "epoch": 2.73, "learning_rate": 2.4160068874100647e-08, "logits/chosen": -2.1664133071899414, "logits/rejected": -2.1659910678863525, "logps/chosen": -0.8410841822624207, "logps/rejected": -2.0226070880889893, "loss": 0.6692, "rewards/accuracies": 1.0, "rewards/chosen": 0.901004433631897, "rewards/margins": 0.04843825101852417, "rewards/rejected": 0.8525661826133728, "step": 5063 }, { "epoch": 2.73, "learning_rate": 2.4141375805825603e-08, "logits/chosen": -2.0452332496643066, "logits/rejected": -2.0541040897369385, "logps/chosen": -1.061003565788269, "logps/rejected": -4.240513324737549, "loss": 0.4462, "rewards/accuracies": 1.0, "rewards/chosen": 1.072080135345459, "rewards/margins": 0.5754699110984802, "rewards/rejected": 0.49661022424697876, "step": 5064 }, { "epoch": 2.73, "learning_rate": 2.41226876702543e-08, "logits/chosen": -2.143270492553711, "logits/rejected": -2.1366496086120605, "logps/chosen": -3.402912139892578, "logps/rejected": -6.129821300506592, "loss": 0.2945, "rewards/accuracies": 1.0, "rewards/chosen": 1.4984403848648071, "rewards/margins": 1.0715563297271729, "rewards/rejected": 0.4268840253353119, "step": 5065 }, { "epoch": 2.73, "learning_rate": 2.4104004470951622e-08, "logits/chosen": -2.082263231277466, "logits/rejected": -2.282684564590454, "logps/chosen": -0.452446848154068, "logps/rejected": -0.48188042640686035, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 1.178529143333435, "rewards/margins": 0.007494449615478516, "rewards/rejected": 1.1710346937179565, "step": 5066 }, { "epoch": 2.73, "learning_rate": 2.4085326211481517e-08, "logits/chosen": -2.1750123500823975, "logits/rejected": -2.070094108581543, "logps/chosen": -26.492210388183594, "logps/rejected": -6.962762832641602, "loss": 0.1225, "rewards/accuracies": 1.0, "rewards/chosen": 2.509615421295166, "rewards/margins": 2.0378267765045166, "rewards/rejected": 0.471788614988327, "step": 5067 }, { "epoch": 2.73, "learning_rate": 2.406665289540698e-08, "logits/chosen": -2.08789324760437, "logits/rejected": -2.0869548320770264, "logps/chosen": -0.8061004877090454, "logps/rejected": -1.8534237146377563, "loss": 0.6588, "rewards/accuracies": 1.0, "rewards/chosen": 0.9231413006782532, "rewards/margins": 0.07000696659088135, "rewards/rejected": 0.8531343340873718, "step": 5068 }, { "epoch": 2.73, "learning_rate": 2.4047984526290078e-08, "logits/chosen": -2.0372602939605713, "logits/rejected": -2.0416462421417236, "logps/chosen": -0.9197688102722168, "logps/rejected": -4.6538519859313965, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 0.9872835278511047, "rewards/margins": 0.7324244976043701, "rewards/rejected": 0.2548590302467346, "step": 5069 }, { "epoch": 2.73, "learning_rate": 2.402932110769192e-08, "logits/chosen": -2.165771722793579, "logits/rejected": -2.1569557189941406, "logps/chosen": -4.733434200286865, "logps/rejected": -4.760244369506836, "loss": 0.6112, "rewards/accuracies": 1.0, "rewards/chosen": 0.9166492819786072, "rewards/margins": 0.17123699188232422, "rewards/rejected": 0.745412290096283, "step": 5070 }, { "epoch": 2.74, "learning_rate": 2.4010662643172674e-08, "logits/chosen": -2.348418712615967, "logits/rejected": -2.097355365753174, "logps/chosen": -45.78105545043945, "logps/rejected": -6.734318256378174, "loss": 0.1401, "rewards/accuracies": 1.0, "rewards/chosen": 2.2799794673919678, "rewards/margins": 1.8944450616836548, "rewards/rejected": 0.3855344355106354, "step": 5071 }, { "epoch": 2.74, "learning_rate": 2.3992009136291575e-08, "logits/chosen": -2.0067343711853027, "logits/rejected": -2.2919819355010986, "logps/chosen": -0.29737311601638794, "logps/rejected": -0.3410099446773529, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.8518552780151367, "rewards/margins": 0.011329174041748047, "rewards/rejected": 0.8405261039733887, "step": 5072 }, { "epoch": 2.74, "learning_rate": 2.39733605906069e-08, "logits/chosen": -2.114872694015503, "logits/rejected": -2.264657735824585, "logps/chosen": -1.3499654531478882, "logps/rejected": -1.5662658214569092, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.8818621039390564, "rewards/margins": 0.006443917751312256, "rewards/rejected": 0.8754181861877441, "step": 5073 }, { "epoch": 2.74, "learning_rate": 2.395471700967598e-08, "logits/chosen": -2.1095848083496094, "logits/rejected": -2.0966544151306152, "logps/chosen": -6.678961753845215, "logps/rejected": -3.2659518718719482, "loss": 0.4376, "rewards/accuracies": 1.0, "rewards/chosen": 1.1533658504486084, "rewards/margins": 0.5996744632720947, "rewards/rejected": 0.5536913871765137, "step": 5074 }, { "epoch": 2.74, "learning_rate": 2.39360783970552e-08, "logits/chosen": -2.0327553749084473, "logits/rejected": -2.2458879947662354, "logps/chosen": -0.5004948973655701, "logps/rejected": -0.5602366924285889, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.9630851149559021, "rewards/margins": 0.029806852340698242, "rewards/rejected": 0.9332782626152039, "step": 5075 }, { "epoch": 2.74, "learning_rate": 2.3917444756299996e-08, "logits/chosen": -2.0688087940216064, "logits/rejected": -2.064356803894043, "logps/chosen": -13.784321784973145, "logps/rejected": -10.250226020812988, "loss": 0.234, "rewards/accuracies": 1.0, "rewards/chosen": 1.4993761777877808, "rewards/margins": 1.3332439661026, "rewards/rejected": 0.16613216698169708, "step": 5076 }, { "epoch": 2.74, "learning_rate": 2.3898816090964846e-08, "logits/chosen": -2.3214619159698486, "logits/rejected": -2.352750539779663, "logps/chosen": -0.7845684289932251, "logps/rejected": -0.6514097452163696, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.7655726075172424, "rewards/margins": -0.002644836902618408, "rewards/rejected": 0.7682174444198608, "step": 5077 }, { "epoch": 2.74, "learning_rate": 2.3880192404603334e-08, "logits/chosen": -2.067706823348999, "logits/rejected": -2.073516607284546, "logps/chosen": -1.8134987354278564, "logps/rejected": -4.317776203155518, "loss": 0.3085, "rewards/accuracies": 1.0, "rewards/chosen": 1.5045832395553589, "rewards/margins": 1.017634391784668, "rewards/rejected": 0.48694881796836853, "step": 5078 }, { "epoch": 2.74, "learning_rate": 2.3861573700768018e-08, "logits/chosen": -1.980776309967041, "logits/rejected": -1.9817149639129639, "logps/chosen": -3.0045695304870605, "logps/rejected": -0.5783933401107788, "loss": 0.4617, "rewards/accuracies": 1.0, "rewards/chosen": 1.4505256414413452, "rewards/margins": 0.5330437421798706, "rewards/rejected": 0.9174818992614746, "step": 5079 }, { "epoch": 2.74, "learning_rate": 2.384295998301055e-08, "logits/chosen": -2.084958553314209, "logits/rejected": -2.27836012840271, "logps/chosen": -3.5505192279815674, "logps/rejected": -2.038182020187378, "loss": 0.7851, "rewards/accuracies": 0.0, "rewards/chosen": 0.976288914680481, "rewards/margins": -0.17623567581176758, "rewards/rejected": 1.1525245904922485, "step": 5080 }, { "epoch": 2.74, "learning_rate": 2.3824351254881615e-08, "logits/chosen": -2.2470650672912598, "logits/rejected": -2.0618114471435547, "logps/chosen": -55.36753845214844, "logps/rejected": -0.4842017889022827, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/chosen": 3.004655599594116, "rewards/margins": 2.17158842086792, "rewards/rejected": 0.8330672383308411, "step": 5081 }, { "epoch": 2.74, "learning_rate": 2.3805747519930952e-08, "logits/chosen": -2.097627639770508, "logits/rejected": -2.0953197479248047, "logps/chosen": -5.252356052398682, "logps/rejected": -6.099575519561768, "loss": 0.2727, "rewards/accuracies": 1.0, "rewards/chosen": 1.6947721242904663, "rewards/margins": 1.1599462032318115, "rewards/rejected": 0.53482586145401, "step": 5082 }, { "epoch": 2.74, "learning_rate": 2.378714878170737e-08, "logits/chosen": -2.07490873336792, "logits/rejected": -2.3168532848358154, "logps/chosen": -0.8756150603294373, "logps/rejected": -0.9121875166893005, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.9869802594184875, "rewards/margins": -0.007640361785888672, "rewards/rejected": 0.9946206212043762, "step": 5083 }, { "epoch": 2.74, "learning_rate": 2.3768555043758658e-08, "logits/chosen": -2.006175994873047, "logits/rejected": -2.0011422634124756, "logps/chosen": -6.275274276733398, "logps/rejected": -3.715585947036743, "loss": 0.407, "rewards/accuracies": 1.0, "rewards/chosen": 1.7031911611557007, "rewards/margins": 0.6885573863983154, "rewards/rejected": 1.0146337747573853, "step": 5084 }, { "epoch": 2.74, "learning_rate": 2.3749966309631713e-08, "logits/chosen": -2.052213430404663, "logits/rejected": -2.308359384536743, "logps/chosen": -1.4801796674728394, "logps/rejected": -1.2553234100341797, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 1.0810608863830566, "rewards/margins": 0.008138656616210938, "rewards/rejected": 1.0729222297668457, "step": 5085 }, { "epoch": 2.74, "learning_rate": 2.3731382582872466e-08, "logits/chosen": -2.0488314628601074, "logits/rejected": -2.0441339015960693, "logps/chosen": -3.3471341133117676, "logps/rejected": -1.47169828414917, "loss": 0.6305, "rewards/accuracies": 1.0, "rewards/chosen": 0.8786724209785461, "rewards/margins": 0.12955611944198608, "rewards/rejected": 0.7491163015365601, "step": 5086 }, { "epoch": 2.74, "learning_rate": 2.3712803867025878e-08, "logits/chosen": -2.1176183223724365, "logits/rejected": -2.1694576740264893, "logps/chosen": -6.77698278427124, "logps/rejected": -9.7736234664917, "loss": 0.3547, "rewards/accuracies": 1.0, "rewards/chosen": 1.5200719833374023, "rewards/margins": 0.8538016080856323, "rewards/rejected": 0.66627037525177, "step": 5087 }, { "epoch": 2.74, "learning_rate": 2.3694230165635947e-08, "logits/chosen": -2.0416319370269775, "logits/rejected": -2.3062307834625244, "logps/chosen": -0.8382378816604614, "logps/rejected": -0.7458001375198364, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.9777867197990417, "rewards/margins": 0.002867758274078369, "rewards/rejected": 0.9749189615249634, "step": 5088 }, { "epoch": 2.74, "learning_rate": 2.367566148224577e-08, "logits/chosen": -2.0581960678100586, "logits/rejected": -2.318126916885376, "logps/chosen": -0.3657029867172241, "logps/rejected": -0.3760383725166321, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.8422954678535461, "rewards/margins": 0.027270615100860596, "rewards/rejected": 0.8150248527526855, "step": 5089 }, { "epoch": 2.75, "learning_rate": 2.3657097820397416e-08, "logits/chosen": -2.1249499320983887, "logits/rejected": -2.1238579750061035, "logps/chosen": -1.8919851779937744, "logps/rejected": -4.890442848205566, "loss": 0.2656, "rewards/accuracies": 1.0, "rewards/chosen": 1.7029268741607666, "rewards/margins": 1.1898442506790161, "rewards/rejected": 0.5130826234817505, "step": 5090 }, { "epoch": 2.75, "learning_rate": 2.363853918363204e-08, "logits/chosen": -2.1151769161224365, "logits/rejected": -2.120962381362915, "logps/chosen": -0.662282407283783, "logps/rejected": -2.695605516433716, "loss": 0.4894, "rewards/accuracies": 1.0, "rewards/chosen": 1.104173183441162, "rewards/margins": 0.4598439931869507, "rewards/rejected": 0.6443291902542114, "step": 5091 }, { "epoch": 2.75, "learning_rate": 2.361998557548982e-08, "logits/chosen": -2.2061967849731445, "logits/rejected": -2.2166385650634766, "logps/chosen": -2.4652442932128906, "logps/rejected": -5.1753435134887695, "loss": 0.3254, "rewards/accuracies": 1.0, "rewards/chosen": 1.3653202056884766, "rewards/margins": 0.9557702541351318, "rewards/rejected": 0.40954992175102234, "step": 5092 }, { "epoch": 2.75, "learning_rate": 2.3601436999509976e-08, "logits/chosen": -2.0534822940826416, "logits/rejected": -2.3243229389190674, "logps/chosen": -0.6395016312599182, "logps/rejected": -0.5965932607650757, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 1.092844843864441, "rewards/margins": 0.017657995223999023, "rewards/rejected": 1.075186848640442, "step": 5093 }, { "epoch": 2.75, "learning_rate": 2.3582893459230773e-08, "logits/chosen": -2.0730185508728027, "logits/rejected": -2.237515926361084, "logps/chosen": -0.5172083377838135, "logps/rejected": -0.5116361379623413, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.929914653301239, "rewards/margins": 0.02659785747528076, "rewards/rejected": 0.9033167958259583, "step": 5094 }, { "epoch": 2.75, "learning_rate": 2.356435495818952e-08, "logits/chosen": -1.9917787313461304, "logits/rejected": -1.9875578880310059, "logps/chosen": -7.083817481994629, "logps/rejected": -3.9015653133392334, "loss": 0.2831, "rewards/accuracies": 1.0, "rewards/chosen": 1.6865910291671753, "rewards/margins": 1.1169909238815308, "rewards/rejected": 0.5696001052856445, "step": 5095 }, { "epoch": 2.75, "learning_rate": 2.354582149992255e-08, "logits/chosen": -2.0817766189575195, "logits/rejected": -2.0885469913482666, "logps/chosen": -2.682380199432373, "logps/rejected": -5.935777187347412, "loss": 0.4461, "rewards/accuracies": 1.0, "rewards/chosen": 0.9473289847373962, "rewards/margins": 0.5759249925613403, "rewards/rejected": 0.3714040219783783, "step": 5096 }, { "epoch": 2.75, "learning_rate": 2.3527293087965255e-08, "logits/chosen": -2.3542563915252686, "logits/rejected": -2.2068722248077393, "logps/chosen": -19.26520538330078, "logps/rejected": -1.710194706916809, "loss": 0.1476, "rewards/accuracies": 1.0, "rewards/chosen": 2.4861698150634766, "rewards/margins": 1.8382105827331543, "rewards/rejected": 0.6479591727256775, "step": 5097 }, { "epoch": 2.75, "learning_rate": 2.3508769725852034e-08, "logits/chosen": -1.972753882408142, "logits/rejected": -1.9748982191085815, "logps/chosen": -2.671872138977051, "logps/rejected": -0.5299164056777954, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.7981392741203308, "rewards/margins": 0.01439279317855835, "rewards/rejected": 0.7837464809417725, "step": 5098 }, { "epoch": 2.75, "learning_rate": 2.3490251417116357e-08, "logits/chosen": -2.054086446762085, "logits/rejected": -2.264569044113159, "logps/chosen": -0.3832041025161743, "logps/rejected": -0.3620455861091614, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 1.0134389400482178, "rewards/margins": 0.017390668392181396, "rewards/rejected": 0.9960482716560364, "step": 5099 }, { "epoch": 2.75, "learning_rate": 2.347173816529071e-08, "logits/chosen": -2.1546077728271484, "logits/rejected": -2.31538987159729, "logps/chosen": -2.847935438156128, "logps/rejected": -3.110159158706665, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 0.9496140480041504, "rewards/margins": 0.04676932096481323, "rewards/rejected": 0.9028447270393372, "step": 5100 }, { "epoch": 2.75, "learning_rate": 2.3453229973906613e-08, "logits/chosen": -2.221080780029297, "logits/rejected": -2.2363405227661133, "logps/chosen": -2.5473015308380127, "logps/rejected": -10.249791145324707, "loss": 0.4153, "rewards/accuracies": 1.0, "rewards/chosen": 1.4673211574554443, "rewards/margins": 0.6640282273292542, "rewards/rejected": 0.8032929301261902, "step": 5101 }, { "epoch": 2.75, "learning_rate": 2.3434726846494634e-08, "logits/chosen": -2.006300210952759, "logits/rejected": -1.9931774139404297, "logps/chosen": -0.7083048820495605, "logps/rejected": -3.7904553413391113, "loss": 0.5145, "rewards/accuracies": 1.0, "rewards/chosen": 1.0379772186279297, "rewards/margins": 0.3962823152542114, "rewards/rejected": 0.6416949033737183, "step": 5102 }, { "epoch": 2.75, "learning_rate": 2.3416228786584358e-08, "logits/chosen": -2.1135029792785645, "logits/rejected": -2.116187334060669, "logps/chosen": -3.4057297706604004, "logps/rejected": -0.8893408179283142, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": 0.7374147772789001, "rewards/margins": 0.04603683948516846, "rewards/rejected": 0.6913779377937317, "step": 5103 }, { "epoch": 2.75, "learning_rate": 2.3397735797704422e-08, "logits/chosen": -2.0207300186157227, "logits/rejected": -2.313279628753662, "logps/chosen": -0.982756495475769, "logps/rejected": -1.0199581384658813, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9421697854995728, "rewards/margins": 0.0010636448860168457, "rewards/rejected": 0.9411061406135559, "step": 5104 }, { "epoch": 2.75, "learning_rate": 2.3379247883382476e-08, "logits/chosen": -2.064720392227173, "logits/rejected": -2.0688302516937256, "logps/chosen": -0.16777953505516052, "logps/rejected": -5.872118949890137, "loss": 0.4962, "rewards/accuracies": 1.0, "rewards/chosen": 0.9433790445327759, "rewards/margins": 0.44245415925979614, "rewards/rejected": 0.5009248852729797, "step": 5105 }, { "epoch": 2.75, "learning_rate": 2.336076504714522e-08, "logits/chosen": -2.069206714630127, "logits/rejected": -2.071148633956909, "logps/chosen": -1.1706708669662476, "logps/rejected": -3.356196880340576, "loss": 0.5102, "rewards/accuracies": 1.0, "rewards/chosen": 0.8985449075698853, "rewards/margins": 0.40714794397354126, "rewards/rejected": 0.491396963596344, "step": 5106 }, { "epoch": 2.75, "learning_rate": 2.3342287292518375e-08, "logits/chosen": -2.3037171363830566, "logits/rejected": -2.3900465965270996, "logps/chosen": -7.276435852050781, "logps/rejected": -12.788232803344727, "loss": 0.5957, "rewards/accuracies": 1.0, "rewards/chosen": 1.2522106170654297, "rewards/margins": 0.20549368858337402, "rewards/rejected": 1.0467169284820557, "step": 5107 }, { "epoch": 2.76, "learning_rate": 2.3323814623026695e-08, "logits/chosen": -2.1599044799804688, "logits/rejected": -2.289769172668457, "logps/chosen": -2.3388307094573975, "logps/rejected": -0.4436195194721222, "loss": 0.7116, "rewards/accuracies": 0.0, "rewards/chosen": 0.8837170600891113, "rewards/margins": -0.036475300788879395, "rewards/rejected": 0.9201923608779907, "step": 5108 }, { "epoch": 2.76, "learning_rate": 2.3305347042193958e-08, "logits/chosen": -2.017932176589966, "logits/rejected": -2.003946304321289, "logps/chosen": -5.383712291717529, "logps/rejected": -6.281821250915527, "loss": 0.2984, "rewards/accuracies": 1.0, "rewards/chosen": 1.5966230630874634, "rewards/margins": 1.0565330982208252, "rewards/rejected": 0.5400899052619934, "step": 5109 }, { "epoch": 2.76, "learning_rate": 2.3286884553542968e-08, "logits/chosen": -2.0255367755889893, "logits/rejected": -2.2755086421966553, "logps/chosen": -1.7126797437667847, "logps/rejected": -1.3562920093536377, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.8676055073738098, "rewards/margins": -0.00116652250289917, "rewards/rejected": 0.868772029876709, "step": 5110 }, { "epoch": 2.76, "learning_rate": 2.3268427160595598e-08, "logits/chosen": -2.047189950942993, "logits/rejected": -2.0525336265563965, "logps/chosen": -1.7802445888519287, "logps/rejected": -3.7786176204681396, "loss": 0.4464, "rewards/accuracies": 1.0, "rewards/chosen": 1.0477527379989624, "rewards/margins": 0.5750175714492798, "rewards/rejected": 0.47273513674736023, "step": 5111 }, { "epoch": 2.76, "learning_rate": 2.3249974866872707e-08, "logits/chosen": -2.0294036865234375, "logits/rejected": -2.3146307468414307, "logps/chosen": -0.16571876406669617, "logps/rejected": -0.18967419862747192, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9522027373313904, "rewards/margins": 0.009220600128173828, "rewards/rejected": 0.9429821372032166, "step": 5112 }, { "epoch": 2.76, "learning_rate": 2.3231527675894203e-08, "logits/chosen": -2.0327651500701904, "logits/rejected": -2.021641492843628, "logps/chosen": -6.0244317054748535, "logps/rejected": -3.703442096710205, "loss": 0.5596, "rewards/accuracies": 1.0, "rewards/chosen": 1.1648447513580322, "rewards/margins": 0.28762513399124146, "rewards/rejected": 0.8772196173667908, "step": 5113 }, { "epoch": 2.76, "learning_rate": 2.3213085591178977e-08, "logits/chosen": -2.0750298500061035, "logits/rejected": -2.269451379776001, "logps/chosen": -1.1023168563842773, "logps/rejected": -6.212194442749023, "loss": 0.62, "rewards/accuracies": 1.0, "rewards/chosen": 0.8771690726280212, "rewards/margins": 0.1521472930908203, "rewards/rejected": 0.7250217795372009, "step": 5114 }, { "epoch": 2.76, "learning_rate": 2.3194648616244994e-08, "logits/chosen": -2.174337148666382, "logits/rejected": -2.361100912094116, "logps/chosen": -5.709299087524414, "logps/rejected": -6.034134864807129, "loss": 0.6631, "rewards/accuracies": 1.0, "rewards/chosen": 0.8372575640678406, "rewards/margins": 0.06109905242919922, "rewards/rejected": 0.7761585116386414, "step": 5115 }, { "epoch": 2.76, "learning_rate": 2.317621675460923e-08, "logits/chosen": -2.1616992950439453, "logits/rejected": -2.1533710956573486, "logps/chosen": -7.164791107177734, "logps/rejected": -1.7612082958221436, "loss": 0.3975, "rewards/accuracies": 1.0, "rewards/chosen": 1.4478470087051392, "rewards/margins": 0.7172630429267883, "rewards/rejected": 0.7305839657783508, "step": 5116 }, { "epoch": 2.76, "learning_rate": 2.315779000978768e-08, "logits/chosen": -1.9660931825637817, "logits/rejected": -1.9671941995620728, "logps/chosen": -1.4391920566558838, "logps/rejected": -0.724607527256012, "loss": 0.6232, "rewards/accuracies": 1.0, "rewards/chosen": 1.095221757888794, "rewards/margins": 0.14518463611602783, "rewards/rejected": 0.9500371217727661, "step": 5117 }, { "epoch": 2.76, "learning_rate": 2.3139368385295377e-08, "logits/chosen": -2.1020822525024414, "logits/rejected": -2.109470844268799, "logps/chosen": -5.145431041717529, "logps/rejected": -11.575782775878906, "loss": 0.3714, "rewards/accuracies": 1.0, "rewards/chosen": 1.1195766925811768, "rewards/margins": 0.7990458011627197, "rewards/rejected": 0.32053089141845703, "step": 5118 }, { "epoch": 2.76, "learning_rate": 2.312095188464637e-08, "logits/chosen": -2.119396448135376, "logits/rejected": -2.3336853981018066, "logps/chosen": -0.5419253706932068, "logps/rejected": -0.5686027407646179, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 1.0011717081069946, "rewards/margins": 0.029838860034942627, "rewards/rejected": 0.971332848072052, "step": 5119 }, { "epoch": 2.76, "learning_rate": 2.310254051135371e-08, "logits/chosen": -2.075381278991699, "logits/rejected": -2.1446449756622314, "logps/chosen": -7.165008068084717, "logps/rejected": -18.63530731201172, "loss": 0.3321, "rewards/accuracies": 1.0, "rewards/chosen": 1.3779354095458984, "rewards/margins": 0.9318431615829468, "rewards/rejected": 0.4460922181606293, "step": 5120 }, { "epoch": 2.76, "learning_rate": 2.3084134268929506e-08, "logits/chosen": -2.1330301761627197, "logits/rejected": -2.2532591819763184, "logps/chosen": -2.1076431274414062, "logps/rejected": -1.928166151046753, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.9667035937309265, "rewards/margins": 0.028385400772094727, "rewards/rejected": 0.9383181929588318, "step": 5121 }, { "epoch": 2.76, "learning_rate": 2.3065733160884843e-08, "logits/chosen": -2.067183017730713, "logits/rejected": -2.2979185581207275, "logps/chosen": -0.36357733607292175, "logps/rejected": -0.35810691118240356, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.8379611372947693, "rewards/margins": 0.0017778873443603516, "rewards/rejected": 0.8361832499504089, "step": 5122 }, { "epoch": 2.76, "learning_rate": 2.30473371907299e-08, "logits/chosen": -2.2608113288879395, "logits/rejected": -2.250659227371216, "logps/chosen": -1.0595507621765137, "logps/rejected": -5.063747406005859, "loss": 0.4248, "rewards/accuracies": 1.0, "rewards/chosen": 0.9982168078422546, "rewards/margins": 0.6361302137374878, "rewards/rejected": 0.36208659410476685, "step": 5123 }, { "epoch": 2.76, "learning_rate": 2.30289463619738e-08, "logits/chosen": -2.1691787242889404, "logits/rejected": -2.171855926513672, "logps/chosen": -0.18113689124584198, "logps/rejected": -6.926201343536377, "loss": 0.3972, "rewards/accuracies": 1.0, "rewards/chosen": 0.8595705032348633, "rewards/margins": 0.7180639505386353, "rewards/rejected": 0.14150653779506683, "step": 5124 }, { "epoch": 2.76, "learning_rate": 2.3010560678124723e-08, "logits/chosen": -2.219829797744751, "logits/rejected": -2.150489091873169, "logps/chosen": -20.64061164855957, "logps/rejected": -2.2013142108917236, "loss": 0.1591, "rewards/accuracies": 1.0, "rewards/chosen": 2.36383318901062, "rewards/margins": 1.7579045295715332, "rewards/rejected": 0.6059285998344421, "step": 5125 }, { "epoch": 2.76, "learning_rate": 2.2992180142689866e-08, "logits/chosen": -2.0539774894714355, "logits/rejected": -2.0588016510009766, "logps/chosen": -2.4256045818328857, "logps/rejected": -3.456130027770996, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": 1.2430294752120972, "rewards/margins": 0.8373123407363892, "rewards/rejected": 0.4057171046733856, "step": 5126 }, { "epoch": 2.77, "learning_rate": 2.297380475917542e-08, "logits/chosen": -2.1062047481536865, "logits/rejected": -2.2809057235717773, "logps/chosen": -4.970586776733398, "logps/rejected": -4.920457363128662, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.7093425989151001, "rewards/margins": 0.0321163535118103, "rewards/rejected": 0.6772262454032898, "step": 5127 }, { "epoch": 2.77, "learning_rate": 2.2955434531086626e-08, "logits/chosen": -2.1514480113983154, "logits/rejected": -2.260462522506714, "logps/chosen": -0.6265159845352173, "logps/rejected": -0.8595929145812988, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.96845543384552, "rewards/margins": 0.02757483720779419, "rewards/rejected": 0.9408805966377258, "step": 5128 }, { "epoch": 2.77, "learning_rate": 2.293706946192772e-08, "logits/chosen": -2.002408027648926, "logits/rejected": -2.317664384841919, "logps/chosen": -5.066728591918945, "logps/rejected": -7.284767150878906, "loss": 0.6653, "rewards/accuracies": 1.0, "rewards/chosen": 0.7663421630859375, "rewards/margins": 0.05643707513809204, "rewards/rejected": 0.7099050879478455, "step": 5129 }, { "epoch": 2.77, "learning_rate": 2.291870955520198e-08, "logits/chosen": -2.230363607406616, "logits/rejected": -2.3446950912475586, "logps/chosen": -0.22906872630119324, "logps/rejected": -0.23389309644699097, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 1.0003818273544312, "rewards/margins": 0.005119740962982178, "rewards/rejected": 0.995262086391449, "step": 5130 }, { "epoch": 2.77, "learning_rate": 2.2900354814411644e-08, "logits/chosen": -2.0502591133117676, "logits/rejected": -2.0440404415130615, "logps/chosen": -0.5035892724990845, "logps/rejected": -5.724905967712402, "loss": 0.4282, "rewards/accuracies": 1.0, "rewards/chosen": 1.0869373083114624, "rewards/margins": 0.6264435052871704, "rewards/rejected": 0.4604937732219696, "step": 5131 }, { "epoch": 2.77, "learning_rate": 2.2882005243058018e-08, "logits/chosen": -2.2088546752929688, "logits/rejected": -2.0681607723236084, "logps/chosen": -28.540027618408203, "logps/rejected": -5.933505535125732, "loss": 0.1347, "rewards/accuracies": 1.0, "rewards/chosen": 2.3479487895965576, "rewards/margins": 1.9365788698196411, "rewards/rejected": 0.4113699495792389, "step": 5132 }, { "epoch": 2.77, "learning_rate": 2.2863660844641386e-08, "logits/chosen": -2.0936362743377686, "logits/rejected": -2.2907609939575195, "logps/chosen": -0.3598783016204834, "logps/rejected": -0.4656561017036438, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.8666410446166992, "rewards/margins": -0.00374525785446167, "rewards/rejected": 0.8703863024711609, "step": 5133 }, { "epoch": 2.77, "learning_rate": 2.28453216226611e-08, "logits/chosen": -2.055464267730713, "logits/rejected": -2.2820608615875244, "logps/chosen": -1.3507206439971924, "logps/rejected": -46.52003860473633, "loss": 0.2422, "rewards/accuracies": 1.0, "rewards/chosen": 0.8727216720581055, "rewards/margins": 1.2946679592132568, "rewards/rejected": -0.42194634675979614, "step": 5134 }, { "epoch": 2.77, "learning_rate": 2.282698758061547e-08, "logits/chosen": -2.123624563217163, "logits/rejected": -2.087818145751953, "logps/chosen": -2.451530694961548, "logps/rejected": -4.595796585083008, "loss": 0.3869, "rewards/accuracies": 1.0, "rewards/chosen": 1.1957905292510986, "rewards/margins": 0.7497959136962891, "rewards/rejected": 0.4459945857524872, "step": 5135 }, { "epoch": 2.77, "learning_rate": 2.2808658722001838e-08, "logits/chosen": -2.0057148933410645, "logits/rejected": -2.2844200134277344, "logps/chosen": -2.043492317199707, "logps/rejected": -9.621185302734375, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.9258913993835449, "rewards/margins": 0.0030417442321777344, "rewards/rejected": 0.9228496551513672, "step": 5136 }, { "epoch": 2.77, "learning_rate": 2.279033505031655e-08, "logits/chosen": -2.002854347229004, "logits/rejected": -1.999157428741455, "logps/chosen": -2.682668924331665, "logps/rejected": -2.6000945568084717, "loss": 0.3647, "rewards/accuracies": 1.0, "rewards/chosen": 1.4985450506210327, "rewards/margins": 0.8209006190299988, "rewards/rejected": 0.6776444315910339, "step": 5137 }, { "epoch": 2.77, "learning_rate": 2.2772016569054974e-08, "logits/chosen": -2.0287699699401855, "logits/rejected": -2.307826042175293, "logps/chosen": -1.3321399688720703, "logps/rejected": -1.492068886756897, "loss": 0.7027, "rewards/accuracies": 0.0, "rewards/chosen": 1.1333907842636108, "rewards/margins": -0.019054293632507324, "rewards/rejected": 1.1524450778961182, "step": 5138 }, { "epoch": 2.77, "learning_rate": 2.2753703281711473e-08, "logits/chosen": -2.0034759044647217, "logits/rejected": -1.9786101579666138, "logps/chosen": -11.079124450683594, "logps/rejected": -7.526688098907471, "loss": 0.3053, "rewards/accuracies": 1.0, "rewards/chosen": 1.6964870691299438, "rewards/margins": 1.0300662517547607, "rewards/rejected": 0.6664208769798279, "step": 5139 }, { "epoch": 2.77, "learning_rate": 2.2735395191779438e-08, "logits/chosen": -2.0552268028259277, "logits/rejected": -2.0469894409179688, "logps/chosen": -5.0043044090271, "logps/rejected": -2.216053009033203, "loss": 0.3374, "rewards/accuracies": 1.0, "rewards/chosen": 1.740203857421875, "rewards/margins": 0.913172721862793, "rewards/rejected": 0.827031135559082, "step": 5140 }, { "epoch": 2.77, "learning_rate": 2.271709230275125e-08, "logits/chosen": -2.0588369369506836, "logits/rejected": -2.3302369117736816, "logps/chosen": -1.3890331983566284, "logps/rejected": -1.4213184118270874, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.8527188301086426, "rewards/margins": 0.02011328935623169, "rewards/rejected": 0.8326055407524109, "step": 5141 }, { "epoch": 2.77, "learning_rate": 2.269879461811831e-08, "logits/chosen": -2.096071481704712, "logits/rejected": -2.3242063522338867, "logps/chosen": -0.9537197947502136, "logps/rejected": -1.035092830657959, "loss": 0.6692, "rewards/accuracies": 1.0, "rewards/chosen": 0.8567226529121399, "rewards/margins": 0.04845082759857178, "rewards/rejected": 0.8082718253135681, "step": 5142 }, { "epoch": 2.77, "learning_rate": 2.2680502141371023e-08, "logits/chosen": -2.138124465942383, "logits/rejected": -2.138972282409668, "logps/chosen": -0.8116971850395203, "logps/rejected": -4.848720550537109, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 0.9365183711051941, "rewards/margins": 0.4241476058959961, "rewards/rejected": 0.512370765209198, "step": 5143 }, { "epoch": 2.77, "learning_rate": 2.2662214875998797e-08, "logits/chosen": -2.1485302448272705, "logits/rejected": -2.1477322578430176, "logps/chosen": -1.017539381980896, "logps/rejected": -5.180729866027832, "loss": 0.3895, "rewards/accuracies": 1.0, "rewards/chosen": 1.1310155391693115, "rewards/margins": 0.7417194843292236, "rewards/rejected": 0.3892960548400879, "step": 5144 }, { "epoch": 2.78, "learning_rate": 2.2643932825490053e-08, "logits/chosen": -2.1406006813049316, "logits/rejected": -2.340027093887329, "logps/chosen": -0.3566851317882538, "logps/rejected": -0.39506953954696655, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.8356478810310364, "rewards/margins": 0.008324623107910156, "rewards/rejected": 0.8273232579231262, "step": 5145 }, { "epoch": 2.78, "learning_rate": 2.262565599333221e-08, "logits/chosen": -2.1521549224853516, "logits/rejected": -2.3138961791992188, "logps/chosen": -1.6199779510498047, "logps/rejected": -1.7413530349731445, "loss": 0.6739, "rewards/accuracies": 1.0, "rewards/chosen": 0.6491739153862, "rewards/margins": 0.03892582654953003, "rewards/rejected": 0.6102480888366699, "step": 5146 }, { "epoch": 2.78, "learning_rate": 2.2607384383011697e-08, "logits/chosen": -2.187487840652466, "logits/rejected": -2.1370785236358643, "logps/chosen": -15.625943183898926, "logps/rejected": -3.6547954082489014, "loss": 0.264, "rewards/accuracies": 1.0, "rewards/chosen": 1.7667607069015503, "rewards/margins": 1.1968128681182861, "rewards/rejected": 0.5699478983879089, "step": 5147 }, { "epoch": 2.78, "learning_rate": 2.258911799801394e-08, "logits/chosen": -1.9678006172180176, "logits/rejected": -1.9802765846252441, "logps/chosen": -1.2706459760665894, "logps/rejected": -7.269747734069824, "loss": 0.4144, "rewards/accuracies": 1.0, "rewards/chosen": 1.2334741353988647, "rewards/margins": 0.6665663123130798, "rewards/rejected": 0.5669078230857849, "step": 5148 }, { "epoch": 2.78, "learning_rate": 2.257085684182338e-08, "logits/chosen": -2.075456142425537, "logits/rejected": -2.278796911239624, "logps/chosen": -0.46881383657455444, "logps/rejected": -0.44790422916412354, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.8826306462287903, "rewards/margins": 0.014761805534362793, "rewards/rejected": 0.8678688406944275, "step": 5149 }, { "epoch": 2.78, "learning_rate": 2.255260091792345e-08, "logits/chosen": -2.067972183227539, "logits/rejected": -2.0596354007720947, "logps/chosen": -7.734655857086182, "logps/rejected": -4.523924350738525, "loss": 0.2989, "rewards/accuracies": 1.0, "rewards/chosen": 1.5910078287124634, "rewards/margins": 1.0544098615646362, "rewards/rejected": 0.5365979671478271, "step": 5150 }, { "epoch": 2.78, "learning_rate": 2.2534350229796585e-08, "logits/chosen": -2.020796775817871, "logits/rejected": -2.2542717456817627, "logps/chosen": -0.33110639452934265, "logps/rejected": -0.3906455636024475, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 1.0198341608047485, "rewards/margins": 0.012854337692260742, "rewards/rejected": 1.0069798231124878, "step": 5151 }, { "epoch": 2.78, "learning_rate": 2.2516104780924227e-08, "logits/chosen": -2.091479539871216, "logits/rejected": -2.0973479747772217, "logps/chosen": -1.4121427536010742, "logps/rejected": -2.9642693996429443, "loss": 0.4484, "rewards/accuracies": 1.0, "rewards/chosen": 1.1893175840377808, "rewards/margins": 0.5696123838424683, "rewards/rejected": 0.6197052001953125, "step": 5152 }, { "epoch": 2.78, "learning_rate": 2.2497864574786817e-08, "logits/chosen": -2.0745980739593506, "logits/rejected": -2.076141834259033, "logps/chosen": -1.1147369146347046, "logps/rejected": -5.233304023742676, "loss": 0.5223, "rewards/accuracies": 1.0, "rewards/chosen": 0.906125545501709, "rewards/margins": 0.37706422805786133, "rewards/rejected": 0.5290613174438477, "step": 5153 }, { "epoch": 2.78, "learning_rate": 2.2479629614863798e-08, "logits/chosen": -1.9309817552566528, "logits/rejected": -2.2675633430480957, "logps/chosen": -4.828010559082031, "logps/rejected": -4.023481845855713, "loss": 0.6505, "rewards/accuracies": 1.0, "rewards/chosen": 0.9264801144599915, "rewards/margins": 0.08712959289550781, "rewards/rejected": 0.8393505215644836, "step": 5154 }, { "epoch": 2.78, "learning_rate": 2.2461399904633586e-08, "logits/chosen": -2.0113000869750977, "logits/rejected": -2.246859312057495, "logps/chosen": -0.13879986107349396, "logps/rejected": -0.15445579588413239, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.9975407719612122, "rewards/margins": -0.008397877216339111, "rewards/rejected": 1.0059386491775513, "step": 5155 }, { "epoch": 2.78, "learning_rate": 2.244317544757366e-08, "logits/chosen": -2.0865368843078613, "logits/rejected": -2.0829482078552246, "logps/chosen": -13.46133041381836, "logps/rejected": -11.710655212402344, "loss": 0.7275, "rewards/accuracies": 0.0, "rewards/chosen": 0.853489100933075, "rewards/margins": -0.06752991676330566, "rewards/rejected": 0.9210190176963806, "step": 5156 }, { "epoch": 2.78, "learning_rate": 2.2424956247160432e-08, "logits/chosen": -2.1186506748199463, "logits/rejected": -1.9963738918304443, "logps/chosen": -28.462505340576172, "logps/rejected": -4.397202014923096, "loss": 0.1185, "rewards/accuracies": 1.0, "rewards/chosen": 2.46879506111145, "rewards/margins": 2.0726873874664307, "rewards/rejected": 0.3961077332496643, "step": 5157 }, { "epoch": 2.78, "learning_rate": 2.2406742306869337e-08, "logits/chosen": -2.0541889667510986, "logits/rejected": -2.0545742511749268, "logps/chosen": -3.5756468772888184, "logps/rejected": -4.487166881561279, "loss": 0.4894, "rewards/accuracies": 1.0, "rewards/chosen": 1.1321293115615845, "rewards/margins": 0.45993274450302124, "rewards/rejected": 0.6721965670585632, "step": 5158 }, { "epoch": 2.78, "learning_rate": 2.2388533630174804e-08, "logits/chosen": -2.01332950592041, "logits/rejected": -2.30114483833313, "logps/chosen": -0.3936871588230133, "logps/rejected": -0.43210679292678833, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 1.006059169769287, "rewards/margins": 0.0065138936042785645, "rewards/rejected": 0.9995452761650085, "step": 5159 }, { "epoch": 2.78, "learning_rate": 2.237033022055028e-08, "logits/chosen": -2.0099546909332275, "logits/rejected": -2.0108585357666016, "logps/chosen": -0.13828226923942566, "logps/rejected": -7.108068466186523, "loss": 0.3939, "rewards/accuracies": 1.0, "rewards/chosen": 0.9523952603340149, "rewards/margins": 0.7281730771064758, "rewards/rejected": 0.22422218322753906, "step": 5160 }, { "epoch": 2.78, "learning_rate": 2.2352132081468155e-08, "logits/chosen": -2.0272839069366455, "logits/rejected": -2.0214810371398926, "logps/chosen": -1.5347869396209717, "logps/rejected": -4.686092853546143, "loss": 0.4215, "rewards/accuracies": 1.0, "rewards/chosen": 1.2161163091659546, "rewards/margins": 0.6457992792129517, "rewards/rejected": 0.5703170299530029, "step": 5161 }, { "epoch": 2.78, "learning_rate": 2.233393921639985e-08, "logits/chosen": -2.060713052749634, "logits/rejected": -2.266657590866089, "logps/chosen": -1.6622291803359985, "logps/rejected": -1.8558286428451538, "loss": 0.558, "rewards/accuracies": 1.0, "rewards/chosen": 1.1391137838363647, "rewards/margins": 0.29139649868011475, "rewards/rejected": 0.84771728515625, "step": 5162 }, { "epoch": 2.78, "learning_rate": 2.2315751628815776e-08, "logits/chosen": -2.1474087238311768, "logits/rejected": -2.140254020690918, "logps/chosen": -5.080039978027344, "logps/rejected": -4.7321319580078125, "loss": 0.3691, "rewards/accuracies": 1.0, "rewards/chosen": 1.3075560331344604, "rewards/margins": 0.8063803315162659, "rewards/rejected": 0.5011757016181946, "step": 5163 }, { "epoch": 2.79, "learning_rate": 2.2297569322185344e-08, "logits/chosen": -2.0886781215667725, "logits/rejected": -2.089794397354126, "logps/chosen": -2.444268226623535, "logps/rejected": -2.6910054683685303, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 1.7915595769882202, "rewards/margins": 0.8894688487052917, "rewards/rejected": 0.9020907282829285, "step": 5164 }, { "epoch": 2.79, "learning_rate": 2.2279392299976946e-08, "logits/chosen": -2.254894971847534, "logits/rejected": -2.0607047080993652, "logps/chosen": -40.28111267089844, "logps/rejected": -4.065544128417969, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": 3.076488494873047, "rewards/margins": 2.579702377319336, "rewards/rejected": 0.4967862069606781, "step": 5165 }, { "epoch": 2.79, "learning_rate": 2.2261220565657966e-08, "logits/chosen": -2.254704475402832, "logits/rejected": -2.2167882919311523, "logps/chosen": -20.7298583984375, "logps/rejected": -23.22726058959961, "loss": 0.4983, "rewards/accuracies": 1.0, "rewards/chosen": 1.8420814275741577, "rewards/margins": 0.43699562549591064, "rewards/rejected": 1.405085802078247, "step": 5166 }, { "epoch": 2.79, "learning_rate": 2.224305412269477e-08, "logits/chosen": -2.2141809463500977, "logits/rejected": -2.314697742462158, "logps/chosen": -7.863848686218262, "logps/rejected": -5.591439723968506, "loss": 0.6609, "rewards/accuracies": 1.0, "rewards/chosen": 0.8864871263504028, "rewards/margins": 0.0656401515007019, "rewards/rejected": 0.8208469748497009, "step": 5167 }, { "epoch": 2.79, "learning_rate": 2.2224892974552756e-08, "logits/chosen": -1.9541010856628418, "logits/rejected": -1.9544432163238525, "logps/chosen": -1.4276514053344727, "logps/rejected": -6.216399192810059, "loss": 0.4952, "rewards/accuracies": 1.0, "rewards/chosen": 1.0768091678619385, "rewards/margins": 0.44502347707748413, "rewards/rejected": 0.6317856907844543, "step": 5168 }, { "epoch": 2.79, "learning_rate": 2.2206737124696273e-08, "logits/chosen": -1.9702894687652588, "logits/rejected": -2.2709639072418213, "logps/chosen": -6.667118072509766, "logps/rejected": -1.2291654348373413, "loss": 0.5489, "rewards/accuracies": 1.0, "rewards/chosen": 1.2302073240280151, "rewards/margins": 0.3128306269645691, "rewards/rejected": 0.917376697063446, "step": 5169 }, { "epoch": 2.79, "learning_rate": 2.2188586576588663e-08, "logits/chosen": -2.106584310531616, "logits/rejected": -2.0955588817596436, "logps/chosen": -1.2707891464233398, "logps/rejected": -13.194828987121582, "loss": 0.2736, "rewards/accuracies": 1.0, "rewards/chosen": 1.444563627243042, "rewards/margins": 1.1563259363174438, "rewards/rejected": 0.28823766112327576, "step": 5170 }, { "epoch": 2.79, "learning_rate": 2.217044133369227e-08, "logits/chosen": -2.055286169052124, "logits/rejected": -2.291273593902588, "logps/chosen": -0.6618363857269287, "logps/rejected": -0.6671741008758545, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.927330493927002, "rewards/margins": 0.030739963054656982, "rewards/rejected": 0.896590530872345, "step": 5171 }, { "epoch": 2.79, "learning_rate": 2.2152301399468416e-08, "logits/chosen": -2.089137315750122, "logits/rejected": -2.3260087966918945, "logps/chosen": -11.992679595947266, "logps/rejected": -11.058979034423828, "loss": 0.5761, "rewards/accuracies": 1.0, "rewards/chosen": 1.3068478107452393, "rewards/margins": 0.24970853328704834, "rewards/rejected": 1.057139277458191, "step": 5172 }, { "epoch": 2.79, "learning_rate": 2.2134166777377406e-08, "logits/chosen": -2.1728005409240723, "logits/rejected": -2.379096031188965, "logps/chosen": -1.0095136165618896, "logps/rejected": -0.8749166131019592, "loss": 0.7012, "rewards/accuracies": 0.0, "rewards/chosen": 0.8643956184387207, "rewards/margins": -0.01601552963256836, "rewards/rejected": 0.8804111480712891, "step": 5173 }, { "epoch": 2.79, "learning_rate": 2.2116037470878552e-08, "logits/chosen": -2.1074090003967285, "logits/rejected": -2.112406015396118, "logps/chosen": -3.2226693630218506, "logps/rejected": -4.272374153137207, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": 1.1708645820617676, "rewards/margins": 0.4704793691635132, "rewards/rejected": 0.7003852128982544, "step": 5174 }, { "epoch": 2.79, "learning_rate": 2.209791348343013e-08, "logits/chosen": -2.066669225692749, "logits/rejected": -2.2791941165924072, "logps/chosen": -4.4775495529174805, "logps/rejected": -4.650941848754883, "loss": 0.6686, "rewards/accuracies": 1.0, "rewards/chosen": 0.6925263404846191, "rewards/margins": 0.04965752363204956, "rewards/rejected": 0.6428688168525696, "step": 5175 }, { "epoch": 2.79, "learning_rate": 2.2079794818489416e-08, "logits/chosen": -2.0010335445404053, "logits/rejected": -2.288738250732422, "logps/chosen": -2.4064748287200928, "logps/rejected": -5.043266773223877, "loss": 0.5992, "rewards/accuracies": 1.0, "rewards/chosen": 0.8306726813316345, "rewards/margins": 0.19754743576049805, "rewards/rejected": 0.6331252455711365, "step": 5176 }, { "epoch": 2.79, "learning_rate": 2.2061681479512684e-08, "logits/chosen": -2.12074613571167, "logits/rejected": -2.1273720264434814, "logps/chosen": -1.2822389602661133, "logps/rejected": -4.4845075607299805, "loss": 0.4092, "rewards/accuracies": 1.0, "rewards/chosen": 1.1005905866622925, "rewards/margins": 0.6819434762001038, "rewards/rejected": 0.4186471104621887, "step": 5177 }, { "epoch": 2.79, "learning_rate": 2.2043573469955113e-08, "logits/chosen": -2.3015730381011963, "logits/rejected": -2.319547414779663, "logps/chosen": -0.4664645791053772, "logps/rejected": -0.49145302176475525, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.9473051428794861, "rewards/margins": -0.0029633045196533203, "rewards/rejected": 0.9502684473991394, "step": 5178 }, { "epoch": 2.79, "learning_rate": 2.2025470793270983e-08, "logits/chosen": -2.0682342052459717, "logits/rejected": -2.2607595920562744, "logps/chosen": -3.3300585746765137, "logps/rejected": -3.451151132583618, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.7461468577384949, "rewards/margins": -0.004340946674346924, "rewards/rejected": 0.7504878044128418, "step": 5179 }, { "epoch": 2.79, "learning_rate": 2.2007373452913486e-08, "logits/chosen": -2.141622543334961, "logits/rejected": -2.140165090560913, "logps/chosen": -5.377553462982178, "logps/rejected": -3.2574150562286377, "loss": 0.3687, "rewards/accuracies": 1.0, "rewards/chosen": 1.4661760330200195, "rewards/margins": 0.8076582551002502, "rewards/rejected": 0.6585177779197693, "step": 5180 }, { "epoch": 2.79, "learning_rate": 2.19892814523348e-08, "logits/chosen": -2.094027042388916, "logits/rejected": -2.2878239154815674, "logps/chosen": -0.8251909613609314, "logps/rejected": -0.8524401187896729, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 0.9244012832641602, "rewards/margins": 0.03006923198699951, "rewards/rejected": 0.8943320512771606, "step": 5181 }, { "epoch": 2.8, "learning_rate": 2.1971194794986102e-08, "logits/chosen": -2.0686004161834717, "logits/rejected": -2.252794027328491, "logps/chosen": -0.2173638790845871, "logps/rejected": -0.24511763453483582, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8317907452583313, "rewards/margins": 0.025940120220184326, "rewards/rejected": 0.805850625038147, "step": 5182 }, { "epoch": 2.8, "learning_rate": 2.1953113484317542e-08, "logits/chosen": -2.0328807830810547, "logits/rejected": -2.2872767448425293, "logps/chosen": -0.5825854539871216, "logps/rejected": -0.6672925353050232, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 1.0131796598434448, "rewards/margins": -4.184246063232422e-05, "rewards/rejected": 1.0132215023040771, "step": 5183 }, { "epoch": 2.8, "learning_rate": 2.1935037523778238e-08, "logits/chosen": -2.263944625854492, "logits/rejected": -2.142275810241699, "logps/chosen": -31.72583770751953, "logps/rejected": -5.4344940185546875, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": 2.846623659133911, "rewards/margins": 2.14042329788208, "rewards/rejected": 0.7062002420425415, "step": 5184 }, { "epoch": 2.8, "learning_rate": 2.1916966916816315e-08, "logits/chosen": -2.183793783187866, "logits/rejected": -2.066884994506836, "logps/chosen": -25.865724563598633, "logps/rejected": -3.8908839225769043, "loss": 0.1965, "rewards/accuracies": 1.0, "rewards/chosen": 1.9909887313842773, "rewards/margins": 1.5272774696350098, "rewards/rejected": 0.4637112319469452, "step": 5185 }, { "epoch": 2.8, "learning_rate": 2.189890166687885e-08, "logits/chosen": -2.108501672744751, "logits/rejected": -2.3266499042510986, "logps/chosen": -8.304845809936523, "logps/rejected": -9.473855972290039, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 1.0499706268310547, "rewards/margins": 0.0020824670791625977, "rewards/rejected": 1.047888159751892, "step": 5186 }, { "epoch": 2.8, "learning_rate": 2.1880841777411913e-08, "logits/chosen": -2.0933923721313477, "logits/rejected": -2.2865285873413086, "logps/chosen": -2.118596076965332, "logps/rejected": -2.090440034866333, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.9956558346748352, "rewards/margins": 0.00046837329864501953, "rewards/rejected": 0.9951874613761902, "step": 5187 }, { "epoch": 2.8, "learning_rate": 2.186278725186055e-08, "logits/chosen": -2.175109386444092, "logits/rejected": -2.1785707473754883, "logps/chosen": -0.2737722098827362, "logps/rejected": -7.688136100769043, "loss": 0.4278, "rewards/accuracies": 1.0, "rewards/chosen": 1.0094279050827026, "rewards/margins": 0.6275420784950256, "rewards/rejected": 0.381885826587677, "step": 5188 }, { "epoch": 2.8, "learning_rate": 2.1844738093668775e-08, "logits/chosen": -2.1293997764587402, "logits/rejected": -2.139268636703491, "logps/chosen": -1.6169087886810303, "logps/rejected": -2.1823689937591553, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": 1.1903908252716064, "rewards/margins": 0.4616876244544983, "rewards/rejected": 0.7287032008171082, "step": 5189 }, { "epoch": 2.8, "learning_rate": 2.1826694306279586e-08, "logits/chosen": -2.082784414291382, "logits/rejected": -2.260103225708008, "logps/chosen": -0.2894495725631714, "logps/rejected": -0.2747054994106293, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.9053789973258972, "rewards/margins": 0.03931379318237305, "rewards/rejected": 0.8660652041435242, "step": 5190 }, { "epoch": 2.8, "learning_rate": 2.180865589313495e-08, "logits/chosen": -2.0825891494750977, "logits/rejected": -2.2779338359832764, "logps/chosen": -0.36596041917800903, "logps/rejected": -0.430932879447937, "loss": 0.7043, "rewards/accuracies": 0.0, "rewards/chosen": 1.0392619371414185, "rewards/margins": -0.022162675857543945, "rewards/rejected": 1.0614246129989624, "step": 5191 }, { "epoch": 2.8, "learning_rate": 2.1790622857675832e-08, "logits/chosen": -2.04386043548584, "logits/rejected": -2.338588237762451, "logps/chosen": -0.4854658246040344, "logps/rejected": -11.81514835357666, "loss": 0.662, "rewards/accuracies": 1.0, "rewards/chosen": 0.9377841949462891, "rewards/margins": 0.06338012218475342, "rewards/rejected": 0.8744040727615356, "step": 5192 }, { "epoch": 2.8, "learning_rate": 2.1772595203342136e-08, "logits/chosen": -2.1393864154815674, "logits/rejected": -2.2689833641052246, "logps/chosen": -0.1067889854311943, "logps/rejected": -0.10468773543834686, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.8340721130371094, "rewards/margins": 0.005333900451660156, "rewards/rejected": 0.8287382125854492, "step": 5193 }, { "epoch": 2.8, "learning_rate": 2.175457293357277e-08, "logits/chosen": -2.0373127460479736, "logits/rejected": -2.0458922386169434, "logps/chosen": -1.2402622699737549, "logps/rejected": -4.408230304718018, "loss": 0.428, "rewards/accuracies": 1.0, "rewards/chosen": 1.0232146978378296, "rewards/margins": 0.6269567608833313, "rewards/rejected": 0.3962579369544983, "step": 5194 }, { "epoch": 2.8, "learning_rate": 2.1736556051805583e-08, "logits/chosen": -2.0847744941711426, "logits/rejected": -2.0728983879089355, "logps/chosen": -7.034028053283691, "logps/rejected": -0.6555047035217285, "loss": 0.4585, "rewards/accuracies": 1.0, "rewards/chosen": 1.581786870956421, "rewards/margins": 0.5418105125427246, "rewards/rejected": 1.0399763584136963, "step": 5195 }, { "epoch": 2.8, "learning_rate": 2.1718544561477426e-08, "logits/chosen": -2.076627254486084, "logits/rejected": -2.325087547302246, "logps/chosen": -1.4723339080810547, "logps/rejected": -1.4944387674331665, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.6942057609558105, "rewards/margins": 0.030401289463043213, "rewards/rejected": 0.6638044714927673, "step": 5196 }, { "epoch": 2.8, "learning_rate": 2.1700538466024103e-08, "logits/chosen": -2.1266448497772217, "logits/rejected": -2.1433022022247314, "logps/chosen": -7.147156238555908, "logps/rejected": -4.635939121246338, "loss": 0.403, "rewards/accuracies": 1.0, "rewards/chosen": 1.3507503271102905, "rewards/margins": 0.7006425857543945, "rewards/rejected": 0.650107741355896, "step": 5197 }, { "epoch": 2.8, "learning_rate": 2.168253776888041e-08, "logits/chosen": -2.0701801776885986, "logits/rejected": -2.342592716217041, "logps/chosen": -0.29115626215934753, "logps/rejected": -0.27641618251800537, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.9988641738891602, "rewards/margins": 0.01106041669845581, "rewards/rejected": 0.9878037571907043, "step": 5198 }, { "epoch": 2.8, "learning_rate": 2.1664542473480084e-08, "logits/chosen": -2.1691489219665527, "logits/rejected": -2.137540102005005, "logps/chosen": -24.95812225341797, "logps/rejected": -5.444129943847656, "loss": 0.4816, "rewards/accuracies": 1.0, "rewards/chosen": 1.5177268981933594, "rewards/margins": 0.48025035858154297, "rewards/rejected": 1.0374765396118164, "step": 5199 }, { "epoch": 2.8, "learning_rate": 2.1646552583255838e-08, "logits/chosen": -2.137899160385132, "logits/rejected": -2.0939106941223145, "logps/chosen": -18.793222427368164, "logps/rejected": -9.182918548583984, "loss": 0.3319, "rewards/accuracies": 1.0, "rewards/chosen": 1.6224416494369507, "rewards/margins": 0.9322213530540466, "rewards/rejected": 0.690220296382904, "step": 5200 }, { "epoch": 2.81, "learning_rate": 2.16285681016394e-08, "logits/chosen": -2.131434679031372, "logits/rejected": -2.06109619140625, "logps/chosen": -9.717406272888184, "logps/rejected": -7.72086238861084, "loss": 0.6539, "rewards/accuracies": 1.0, "rewards/chosen": 0.7385266423225403, "rewards/margins": 0.08002245426177979, "rewards/rejected": 0.6585041880607605, "step": 5201 }, { "epoch": 2.81, "learning_rate": 2.16105890320614e-08, "logits/chosen": -2.0977225303649902, "logits/rejected": -2.308603286743164, "logps/chosen": -3.2106494903564453, "logps/rejected": -1.0952870845794678, "loss": 0.7067, "rewards/accuracies": 0.0, "rewards/chosen": 0.7215785980224609, "rewards/margins": -0.02692335844039917, "rewards/rejected": 0.7485019564628601, "step": 5202 }, { "epoch": 2.81, "learning_rate": 2.1592615377951474e-08, "logits/chosen": -2.225339412689209, "logits/rejected": -2.3465216159820557, "logps/chosen": -11.727252960205078, "logps/rejected": -9.551316261291504, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.8606447577476501, "rewards/margins": 0.0321580171585083, "rewards/rejected": 0.8284867405891418, "step": 5203 }, { "epoch": 2.81, "learning_rate": 2.157464714273821e-08, "logits/chosen": -2.081725835800171, "logits/rejected": -2.3014440536499023, "logps/chosen": -1.7509034872055054, "logps/rejected": -1.6754060983657837, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.9293921589851379, "rewards/margins": 0.003106415271759033, "rewards/rejected": 0.9262857437133789, "step": 5204 }, { "epoch": 2.81, "learning_rate": 2.1556684329849172e-08, "logits/chosen": -1.9502291679382324, "logits/rejected": -1.9593639373779297, "logps/chosen": -0.951938271522522, "logps/rejected": -3.1378514766693115, "loss": 0.4521, "rewards/accuracies": 1.0, "rewards/chosen": 1.1094944477081299, "rewards/margins": 0.5592740178108215, "rewards/rejected": 0.5502204298973083, "step": 5205 }, { "epoch": 2.81, "learning_rate": 2.15387269427109e-08, "logits/chosen": -2.089276075363159, "logits/rejected": -2.096511125564575, "logps/chosen": -1.4869261980056763, "logps/rejected": -4.194465160369873, "loss": 0.3978, "rewards/accuracies": 1.0, "rewards/chosen": 1.1530932188034058, "rewards/margins": 0.716362476348877, "rewards/rejected": 0.4367307126522064, "step": 5206 }, { "epoch": 2.81, "learning_rate": 2.1520774984748853e-08, "logits/chosen": -2.0017759799957275, "logits/rejected": -2.2877790927886963, "logps/chosen": -0.25985774397850037, "logps/rejected": -0.3147090673446655, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.9751222729682922, "rewards/margins": 0.0017381906509399414, "rewards/rejected": 0.9733840823173523, "step": 5207 }, { "epoch": 2.81, "learning_rate": 2.15028284593875e-08, "logits/chosen": -2.1043388843536377, "logits/rejected": -2.1091253757476807, "logps/chosen": -8.81542682647705, "logps/rejected": -3.0302369594573975, "loss": 0.3297, "rewards/accuracies": 1.0, "rewards/chosen": 1.5175613164901733, "rewards/margins": 0.9402026534080505, "rewards/rejected": 0.5773586630821228, "step": 5208 }, { "epoch": 2.81, "learning_rate": 2.148488737005026e-08, "logits/chosen": -2.1897900104522705, "logits/rejected": -2.1835522651672363, "logps/chosen": -6.028700351715088, "logps/rejected": -5.4486541748046875, "loss": 0.3498, "rewards/accuracies": 1.0, "rewards/chosen": 1.3191293478012085, "rewards/margins": 0.8702989816665649, "rewards/rejected": 0.44883033633232117, "step": 5209 }, { "epoch": 2.81, "learning_rate": 2.1466951720159516e-08, "logits/chosen": -2.1107122898101807, "logits/rejected": -2.2962100505828857, "logps/chosen": -0.5659012794494629, "logps/rejected": -0.6320945024490356, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 0.7120949625968933, "rewards/margins": 0.022023677825927734, "rewards/rejected": 0.6900712847709656, "step": 5210 }, { "epoch": 2.81, "learning_rate": 2.1449021513136595e-08, "logits/chosen": -2.312690258026123, "logits/rejected": -2.2541587352752686, "logps/chosen": -7.7172956466674805, "logps/rejected": -7.719447612762451, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 0.45850488543510437, "rewards/margins": 0.03292366862297058, "rewards/rejected": 0.4255812168121338, "step": 5211 }, { "epoch": 2.81, "learning_rate": 2.1431096752401833e-08, "logits/chosen": -2.221733570098877, "logits/rejected": -2.08598256111145, "logps/chosen": -50.85205841064453, "logps/rejected": -5.392793655395508, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": 2.8060944080352783, "rewards/margins": 2.1813735961914062, "rewards/rejected": 0.6247206926345825, "step": 5212 }, { "epoch": 2.81, "learning_rate": 2.1413177441374487e-08, "logits/chosen": -2.1278109550476074, "logits/rejected": -2.2847061157226562, "logps/chosen": -0.10768016427755356, "logps/rejected": -0.12135928869247437, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9527928233146667, "rewards/margins": 0.03045368194580078, "rewards/rejected": 0.922339141368866, "step": 5213 }, { "epoch": 2.81, "learning_rate": 2.139526358347277e-08, "logits/chosen": -2.2283291816711426, "logits/rejected": -2.114710569381714, "logps/chosen": -15.172419548034668, "logps/rejected": -12.558601379394531, "loss": 0.4375, "rewards/accuracies": 1.0, "rewards/chosen": 1.776183009147644, "rewards/margins": 0.5998591184616089, "rewards/rejected": 1.1763238906860352, "step": 5214 }, { "epoch": 2.81, "learning_rate": 2.1377355182113894e-08, "logits/chosen": -2.0770533084869385, "logits/rejected": -2.0756044387817383, "logps/chosen": -0.6665021777153015, "logps/rejected": -6.539881229400635, "loss": 0.3814, "rewards/accuracies": 1.0, "rewards/chosen": 1.1301770210266113, "rewards/margins": 0.7671231031417847, "rewards/rejected": 0.36305394768714905, "step": 5215 }, { "epoch": 2.81, "learning_rate": 2.1359452240713977e-08, "logits/chosen": -2.1465604305267334, "logits/rejected": -2.2905123233795166, "logps/chosen": -2.74267315864563, "logps/rejected": -2.666391372680664, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 1.0374298095703125, "rewards/margins": 0.0007486343383789062, "rewards/rejected": 1.0366811752319336, "step": 5216 }, { "epoch": 2.81, "learning_rate": 2.134155476268814e-08, "logits/chosen": -2.2402894496917725, "logits/rejected": -2.2348687648773193, "logps/chosen": -0.7129566669464111, "logps/rejected": -5.28821325302124, "loss": 0.4423, "rewards/accuracies": 1.0, "rewards/chosen": 1.0909743309020996, "rewards/margins": 0.586414098739624, "rewards/rejected": 0.5045602321624756, "step": 5217 }, { "epoch": 2.81, "learning_rate": 2.1323662751450445e-08, "logits/chosen": -2.1091878414154053, "logits/rejected": -2.2680256366729736, "logps/chosen": -0.838564932346344, "logps/rejected": -0.821994423866272, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.7760385870933533, "rewards/margins": 0.012867450714111328, "rewards/rejected": 0.7631711363792419, "step": 5218 }, { "epoch": 2.81, "learning_rate": 2.1305776210413907e-08, "logits/chosen": -2.13582706451416, "logits/rejected": -2.305813789367676, "logps/chosen": -4.921392440795898, "logps/rejected": -6.243572235107422, "loss": 0.5631, "rewards/accuracies": 1.0, "rewards/chosen": 1.0801175832748413, "rewards/margins": 0.27957987785339355, "rewards/rejected": 0.8005377054214478, "step": 5219 }, { "epoch": 2.82, "learning_rate": 2.1287895142990502e-08, "logits/chosen": -2.1007192134857178, "logits/rejected": -2.311610460281372, "logps/chosen": -1.3028111457824707, "logps/rejected": -1.619428038597107, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.983306884765625, "rewards/margins": 0.02190333604812622, "rewards/rejected": 0.9614035487174988, "step": 5220 }, { "epoch": 2.82, "learning_rate": 2.1270019552591166e-08, "logits/chosen": -2.171721935272217, "logits/rejected": -2.3549911975860596, "logps/chosen": -3.1500487327575684, "logps/rejected": -3.02774715423584, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 0.7886426448822021, "rewards/margins": -0.007161200046539307, "rewards/rejected": 0.7958038449287415, "step": 5221 }, { "epoch": 2.82, "learning_rate": 2.1252149442625784e-08, "logits/chosen": -2.1383986473083496, "logits/rejected": -2.1689445972442627, "logps/chosen": -9.602251052856445, "logps/rejected": -13.977080345153809, "loss": 0.643, "rewards/accuracies": 1.0, "rewards/chosen": 1.1952637434005737, "rewards/margins": 0.10300648212432861, "rewards/rejected": 1.0922572612762451, "step": 5222 }, { "epoch": 2.82, "learning_rate": 2.1234284816503194e-08, "logits/chosen": -2.0455715656280518, "logits/rejected": -2.357614755630493, "logps/chosen": -0.29981306195259094, "logps/rejected": -0.23412877321243286, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.8604151606559753, "rewards/margins": -0.007887780666351318, "rewards/rejected": 0.8683029413223267, "step": 5223 }, { "epoch": 2.82, "learning_rate": 2.1216425677631195e-08, "logits/chosen": -2.1280908584594727, "logits/rejected": -2.274125814437866, "logps/chosen": -0.17101651430130005, "logps/rejected": -0.18029135465621948, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.953005313873291, "rewards/margins": 0.006127536296844482, "rewards/rejected": 0.9468777775764465, "step": 5224 }, { "epoch": 2.82, "learning_rate": 2.119857202941654e-08, "logits/chosen": -2.0075435638427734, "logits/rejected": -2.0066769123077393, "logps/chosen": -4.649522304534912, "logps/rejected": -0.4993779957294464, "loss": 0.7321, "rewards/accuracies": 0.0, "rewards/chosen": 0.8718013763427734, "rewards/margins": -0.07645606994628906, "rewards/rejected": 0.9482574462890625, "step": 5225 }, { "epoch": 2.82, "learning_rate": 2.1180723875264927e-08, "logits/chosen": -2.0331387519836426, "logits/rejected": -2.275933265686035, "logps/chosen": -4.708430767059326, "logps/rejected": -1.2284260988235474, "loss": 0.6996, "rewards/accuracies": 0.0, "rewards/chosen": 0.8911375403404236, "rewards/margins": -0.012859702110290527, "rewards/rejected": 0.9039972424507141, "step": 5226 }, { "epoch": 2.82, "learning_rate": 2.1162881218581002e-08, "logits/chosen": -2.1774604320526123, "logits/rejected": -2.0341856479644775, "logps/chosen": -29.780471801757812, "logps/rejected": -5.74351167678833, "loss": 0.1959, "rewards/accuracies": 1.0, "rewards/chosen": 1.880531668663025, "rewards/margins": 1.5303882360458374, "rewards/rejected": 0.3501434028148651, "step": 5227 }, { "epoch": 2.82, "learning_rate": 2.1145044062768386e-08, "logits/chosen": -2.043102264404297, "logits/rejected": -2.041250705718994, "logps/chosen": -0.25453561544418335, "logps/rejected": -6.094791889190674, "loss": 0.4289, "rewards/accuracies": 1.0, "rewards/chosen": 0.9629184603691101, "rewards/margins": 0.6243538856506348, "rewards/rejected": 0.33856454491615295, "step": 5228 }, { "epoch": 2.82, "learning_rate": 2.112721241122962e-08, "logits/chosen": -2.1705219745635986, "logits/rejected": -2.3954694271087646, "logps/chosen": -0.5082548260688782, "logps/rejected": -0.549536943435669, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.9665800333023071, "rewards/margins": 0.016794800758361816, "rewards/rejected": 0.9497852325439453, "step": 5229 }, { "epoch": 2.82, "learning_rate": 2.110938626736622e-08, "logits/chosen": -2.0374622344970703, "logits/rejected": -2.255509853363037, "logps/chosen": -8.277568817138672, "logps/rejected": -5.161838531494141, "loss": 0.6695, "rewards/accuracies": 1.0, "rewards/chosen": 0.9309296011924744, "rewards/margins": 0.04786515235900879, "rewards/rejected": 0.8830644488334656, "step": 5230 }, { "epoch": 2.82, "learning_rate": 2.1091565634578633e-08, "logits/chosen": -2.190441370010376, "logits/rejected": -2.3208789825439453, "logps/chosen": -7.078762054443359, "logps/rejected": -6.722479820251465, "loss": 0.7141, "rewards/accuracies": 0.0, "rewards/chosen": 0.4945342242717743, "rewards/margins": -0.041431814432144165, "rewards/rejected": 0.5359660387039185, "step": 5231 }, { "epoch": 2.82, "learning_rate": 2.1073750516266264e-08, "logits/chosen": -2.0040667057037354, "logits/rejected": -2.2671704292297363, "logps/chosen": -0.5208766460418701, "logps/rejected": -0.5719552636146545, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 1.1356148719787598, "rewards/margins": 0.006389498710632324, "rewards/rejected": 1.1292253732681274, "step": 5232 }, { "epoch": 2.82, "learning_rate": 2.105594091582747e-08, "logits/chosen": -2.0863311290740967, "logits/rejected": -2.351680040359497, "logps/chosen": -7.565057754516602, "logps/rejected": -10.467290878295898, "loss": 0.547, "rewards/accuracies": 1.0, "rewards/chosen": 0.9957334399223328, "rewards/margins": 0.31733036041259766, "rewards/rejected": 0.6784030795097351, "step": 5233 }, { "epoch": 2.82, "learning_rate": 2.1038136836659527e-08, "logits/chosen": -1.9768664836883545, "logits/rejected": -1.9746308326721191, "logps/chosen": -0.7991410493850708, "logps/rejected": -3.7320327758789062, "loss": 0.5074, "rewards/accuracies": 1.0, "rewards/chosen": 1.1697295904159546, "rewards/margins": 0.41401952505111694, "rewards/rejected": 0.7557100653648376, "step": 5234 }, { "epoch": 2.82, "learning_rate": 2.1020338282158723e-08, "logits/chosen": -2.0672807693481445, "logits/rejected": -2.0606343746185303, "logps/chosen": -3.6200811862945557, "logps/rejected": -3.6189565658569336, "loss": 0.282, "rewards/accuracies": 1.0, "rewards/chosen": 1.6089372634887695, "rewards/margins": 1.12168550491333, "rewards/rejected": 0.48725175857543945, "step": 5235 }, { "epoch": 2.82, "learning_rate": 2.100254525572024e-08, "logits/chosen": -2.1489593982696533, "logits/rejected": -2.1379754543304443, "logps/chosen": -3.9038968086242676, "logps/rejected": -2.8389337062835693, "loss": 0.3375, "rewards/accuracies": 1.0, "rewards/chosen": 1.705025315284729, "rewards/margins": 0.9127292633056641, "rewards/rejected": 0.7922960519790649, "step": 5236 }, { "epoch": 2.82, "learning_rate": 2.0984757760738188e-08, "logits/chosen": -2.096928834915161, "logits/rejected": -2.0958008766174316, "logps/chosen": -4.873226165771484, "logps/rejected": -4.0184407234191895, "loss": 0.2489, "rewards/accuracies": 1.0, "rewards/chosen": 1.7141464948654175, "rewards/margins": 1.263866662979126, "rewards/rejected": 0.4502798616886139, "step": 5237 }, { "epoch": 2.83, "learning_rate": 2.0966975800605662e-08, "logits/chosen": -2.1347882747650146, "logits/rejected": -2.3045034408569336, "logps/chosen": -2.353630304336548, "logps/rejected": -1.2489880323410034, "loss": 0.6682, "rewards/accuracies": 1.0, "rewards/chosen": 1.0288337469100952, "rewards/margins": 0.050582945346832275, "rewards/rejected": 0.9782508015632629, "step": 5238 }, { "epoch": 2.83, "learning_rate": 2.0949199378714693e-08, "logits/chosen": -2.10396671295166, "logits/rejected": -2.0994527339935303, "logps/chosen": -12.003190994262695, "logps/rejected": -1.0968314409255981, "loss": 0.7816, "rewards/accuracies": 0.0, "rewards/chosen": 0.818098247051239, "rewards/margins": -0.16966557502746582, "rewards/rejected": 0.9877638220787048, "step": 5239 }, { "epoch": 2.83, "learning_rate": 2.0931428498456256e-08, "logits/chosen": -2.138782024383545, "logits/rejected": -2.0315966606140137, "logps/chosen": -17.45261001586914, "logps/rejected": -5.417752265930176, "loss": 0.3068, "rewards/accuracies": 1.0, "rewards/chosen": 1.6566390991210938, "rewards/margins": 1.0243053436279297, "rewards/rejected": 0.6323337554931641, "step": 5240 }, { "epoch": 2.83, "learning_rate": 2.091366316322025e-08, "logits/chosen": -2.1752328872680664, "logits/rejected": -2.180643081665039, "logps/chosen": -2.0627758502960205, "logps/rejected": -5.117713928222656, "loss": 0.3303, "rewards/accuracies": 1.0, "rewards/chosen": 1.304521083831787, "rewards/margins": 0.938237190246582, "rewards/rejected": 0.3662838935852051, "step": 5241 }, { "epoch": 2.83, "learning_rate": 2.089590337639554e-08, "logits/chosen": -2.066894769668579, "logits/rejected": -2.276758909225464, "logps/chosen": -0.24280574917793274, "logps/rejected": -0.2722780704498291, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 0.9391011595726013, "rewards/margins": 0.03184854984283447, "rewards/rejected": 0.9072526097297668, "step": 5242 }, { "epoch": 2.83, "learning_rate": 2.0878149141369923e-08, "logits/chosen": -2.002640724182129, "logits/rejected": -1.9787864685058594, "logps/chosen": -14.49215030670166, "logps/rejected": -2.5839338302612305, "loss": 0.2308, "rewards/accuracies": 1.0, "rewards/chosen": 2.0256919860839844, "rewards/margins": 1.3486969470977783, "rewards/rejected": 0.6769950985908508, "step": 5243 }, { "epoch": 2.83, "learning_rate": 2.0860400461530132e-08, "logits/chosen": -2.1983823776245117, "logits/rejected": -2.278130531311035, "logps/chosen": -5.355426788330078, "logps/rejected": -2.7509384155273438, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.6810552477836609, "rewards/margins": 0.00902557373046875, "rewards/rejected": 0.6720296740531921, "step": 5244 }, { "epoch": 2.83, "learning_rate": 2.0842657340261833e-08, "logits/chosen": -2.190176486968994, "logits/rejected": -2.2848715782165527, "logps/chosen": -6.103134632110596, "logps/rejected": -1.7399978637695312, "loss": 0.7624, "rewards/accuracies": 0.0, "rewards/chosen": 0.7275211811065674, "rewards/margins": -0.13396304845809937, "rewards/rejected": 0.8614842295646667, "step": 5245 }, { "epoch": 2.83, "learning_rate": 2.0824919780949674e-08, "logits/chosen": -2.0255346298217773, "logits/rejected": -2.024745464324951, "logps/chosen": -9.790212631225586, "logps/rejected": -0.9872841835021973, "loss": 0.488, "rewards/accuracies": 1.0, "rewards/chosen": 1.3345991373062134, "rewards/margins": 0.46365129947662354, "rewards/rejected": 0.8709478378295898, "step": 5246 }, { "epoch": 2.83, "learning_rate": 2.0807187786977192e-08, "logits/chosen": -1.9862570762634277, "logits/rejected": -2.301445960998535, "logps/chosen": -0.3130282461643219, "logps/rejected": -0.2924286127090454, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.8130610585212708, "rewards/margins": 0.010759353637695312, "rewards/rejected": 0.8023017048835754, "step": 5247 }, { "epoch": 2.83, "learning_rate": 2.0789461361726896e-08, "logits/chosen": -2.059457778930664, "logits/rejected": -2.2943804264068604, "logps/chosen": -0.3021228611469269, "logps/rejected": -1.8398778438568115, "loss": 0.603, "rewards/accuracies": 1.0, "rewards/chosen": 1.0344905853271484, "rewards/margins": 0.18924999237060547, "rewards/rejected": 0.845240592956543, "step": 5248 }, { "epoch": 2.83, "learning_rate": 2.0771740508580203e-08, "logits/chosen": -2.1872990131378174, "logits/rejected": -2.2142794132232666, "logps/chosen": -14.173666000366211, "logps/rejected": -3.29021954536438, "loss": 0.3439, "rewards/accuracies": 1.0, "rewards/chosen": 1.9633005857467651, "rewards/margins": 0.890417218208313, "rewards/rejected": 1.0728833675384521, "step": 5249 }, { "epoch": 2.83, "learning_rate": 2.0754025230917493e-08, "logits/chosen": -2.055291175842285, "logits/rejected": -2.315091609954834, "logps/chosen": -2.122519016265869, "logps/rejected": -1.7716248035430908, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 1.2118829488754272, "rewards/margins": 0.025589346885681152, "rewards/rejected": 1.186293601989746, "step": 5250 }, { "epoch": 2.83, "learning_rate": 2.073631553211807e-08, "logits/chosen": -2.0596578121185303, "logits/rejected": -2.0682592391967773, "logps/chosen": -1.1544311046600342, "logps/rejected": -3.8505115509033203, "loss": 0.4461, "rewards/accuracies": 1.0, "rewards/chosen": 1.0700749158859253, "rewards/margins": 0.5758621692657471, "rewards/rejected": 0.49421271681785583, "step": 5251 }, { "epoch": 2.83, "learning_rate": 2.0718611415560178e-08, "logits/chosen": -1.990417242050171, "logits/rejected": -2.2491447925567627, "logps/chosen": -0.15168075263500214, "logps/rejected": -0.1852770745754242, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.8461228609085083, "rewards/margins": 0.029817581176757812, "rewards/rejected": 0.8163052797317505, "step": 5252 }, { "epoch": 2.83, "learning_rate": 2.0700912884621003e-08, "logits/chosen": -2.090707302093506, "logits/rejected": -2.27590012550354, "logps/chosen": -0.12312190234661102, "logps/rejected": -0.11982407420873642, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.8761852383613586, "rewards/margins": 0.040449440479278564, "rewards/rejected": 0.8357357978820801, "step": 5253 }, { "epoch": 2.83, "learning_rate": 2.0683219942676633e-08, "logits/chosen": -2.042851209640503, "logits/rejected": -2.2638168334960938, "logps/chosen": -0.3165382742881775, "logps/rejected": -0.3447933793067932, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.8727337121963501, "rewards/margins": 0.02289295196533203, "rewards/rejected": 0.8498407602310181, "step": 5254 }, { "epoch": 2.83, "learning_rate": 2.0665532593102125e-08, "logits/chosen": -2.189354419708252, "logits/rejected": -2.052839756011963, "logps/chosen": -29.223323822021484, "logps/rejected": -18.61075210571289, "loss": 0.2462, "rewards/accuracies": 1.0, "rewards/chosen": 2.0018856525421143, "rewards/margins": 1.2759337425231934, "rewards/rejected": 0.7259519696235657, "step": 5255 }, { "epoch": 2.83, "learning_rate": 2.064785083927144e-08, "logits/chosen": -2.061035394668579, "logits/rejected": -2.291914939880371, "logps/chosen": -0.3261704742908478, "logps/rejected": -0.33541375398635864, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8089480400085449, "rewards/margins": 0.015955805778503418, "rewards/rejected": 0.7929922342300415, "step": 5256 }, { "epoch": 2.84, "learning_rate": 2.063017468455753e-08, "logits/chosen": -2.0383236408233643, "logits/rejected": -2.2515125274658203, "logps/chosen": -0.19473758339881897, "logps/rejected": -0.27966466546058655, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8965773582458496, "rewards/margins": 0.021270334720611572, "rewards/rejected": 0.875307023525238, "step": 5257 }, { "epoch": 2.84, "learning_rate": 2.0612504132332214e-08, "logits/chosen": -2.0686967372894287, "logits/rejected": -2.2280380725860596, "logps/chosen": -4.315158843994141, "logps/rejected": -0.5495297312736511, "loss": 0.8061, "rewards/accuracies": 0.0, "rewards/chosen": 0.5023657083511353, "rewards/margins": -0.21448445320129395, "rewards/rejected": 0.7168501615524292, "step": 5258 }, { "epoch": 2.84, "learning_rate": 2.0594839185966277e-08, "logits/chosen": -2.180265426635742, "logits/rejected": -2.339189052581787, "logps/chosen": -3.4022293090820312, "logps/rejected": -2.8483057022094727, "loss": 0.72, "rewards/accuracies": 0.0, "rewards/chosen": 0.965006947517395, "rewards/margins": -0.05296444892883301, "rewards/rejected": 1.017971396446228, "step": 5259 }, { "epoch": 2.84, "learning_rate": 2.0577179848829412e-08, "logits/chosen": -2.1748039722442627, "logits/rejected": -2.3187222480773926, "logps/chosen": -0.9503727555274963, "logps/rejected": -0.9221614599227905, "loss": 0.7065, "rewards/accuracies": 0.0, "rewards/chosen": 1.0709525346755981, "rewards/margins": -0.026470303535461426, "rewards/rejected": 1.0974228382110596, "step": 5260 }, { "epoch": 2.84, "learning_rate": 2.0559526124290267e-08, "logits/chosen": -2.028555393218994, "logits/rejected": -2.034269332885742, "logps/chosen": -1.346617579460144, "logps/rejected": -5.608351707458496, "loss": 0.3871, "rewards/accuracies": 1.0, "rewards/chosen": 1.1525081396102905, "rewards/margins": 0.7491465210914612, "rewards/rejected": 0.40336161851882935, "step": 5261 }, { "epoch": 2.84, "learning_rate": 2.0541878015716396e-08, "logits/chosen": -2.1055095195770264, "logits/rejected": -2.3021910190582275, "logps/chosen": -0.1436435580253601, "logps/rejected": -0.12674526870250702, "loss": 0.6683, "rewards/accuracies": 1.0, "rewards/chosen": 0.910014271736145, "rewards/margins": 0.05030834674835205, "rewards/rejected": 0.859705924987793, "step": 5262 }, { "epoch": 2.84, "learning_rate": 2.05242355264743e-08, "logits/chosen": -2.0602290630340576, "logits/rejected": -2.2828011512756348, "logps/chosen": -0.3466489315032959, "logps/rejected": -0.4493885636329651, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.9849490523338318, "rewards/margins": 0.026166677474975586, "rewards/rejected": 0.9587823748588562, "step": 5263 }, { "epoch": 2.84, "learning_rate": 2.050659865992939e-08, "logits/chosen": -2.003312587738037, "logits/rejected": -2.0068087577819824, "logps/chosen": -0.7809813022613525, "logps/rejected": -5.1704487800598145, "loss": 0.4429, "rewards/accuracies": 1.0, "rewards/chosen": 1.0845369100570679, "rewards/margins": 0.5847735404968262, "rewards/rejected": 0.4997633397579193, "step": 5264 }, { "epoch": 2.84, "learning_rate": 2.048896741944603e-08, "logits/chosen": -1.9713454246520996, "logits/rejected": -2.277860641479492, "logps/chosen": -2.239661693572998, "logps/rejected": -2.365372657775879, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.8875154852867126, "rewards/margins": -0.00279080867767334, "rewards/rejected": 0.890306293964386, "step": 5265 }, { "epoch": 2.84, "learning_rate": 2.0471341808387487e-08, "logits/chosen": -2.041482448577881, "logits/rejected": -2.2479653358459473, "logps/chosen": -3.4853615760803223, "logps/rejected": -5.449366092681885, "loss": 0.5777, "rewards/accuracies": 1.0, "rewards/chosen": 0.9538292288780212, "rewards/margins": 0.24588125944137573, "rewards/rejected": 0.7079479694366455, "step": 5266 }, { "epoch": 2.84, "learning_rate": 2.0453721830115965e-08, "logits/chosen": -2.1465342044830322, "logits/rejected": -2.3148386478424072, "logps/chosen": -0.26594915986061096, "logps/rejected": -0.33229848742485046, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9684686660766602, "rewards/margins": 0.016543090343475342, "rewards/rejected": 0.9519255757331848, "step": 5267 }, { "epoch": 2.84, "learning_rate": 2.0436107487992588e-08, "logits/chosen": -2.188056468963623, "logits/rejected": -2.314943552017212, "logps/chosen": -1.011564016342163, "logps/rejected": -1.0092604160308838, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 1.0803946256637573, "rewards/margins": 0.010708332061767578, "rewards/rejected": 1.0696862936019897, "step": 5268 }, { "epoch": 2.84, "learning_rate": 2.0418498785377414e-08, "logits/chosen": -2.1712610721588135, "logits/rejected": -2.3666365146636963, "logps/chosen": -0.584898829460144, "logps/rejected": -0.6698904633522034, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 1.0799365043640137, "rewards/margins": 0.010912299156188965, "rewards/rejected": 1.0690242052078247, "step": 5269 }, { "epoch": 2.84, "learning_rate": 2.0400895725629414e-08, "logits/chosen": -2.0241475105285645, "logits/rejected": -2.0101447105407715, "logps/chosen": -8.897920608520508, "logps/rejected": -8.113304138183594, "loss": 0.203, "rewards/accuracies": 1.0, "rewards/chosen": 2.216843366622925, "rewards/margins": 1.4914593696594238, "rewards/rejected": 0.7253839373588562, "step": 5270 }, { "epoch": 2.84, "learning_rate": 2.0383298312106495e-08, "logits/chosen": -2.170703172683716, "logits/rejected": -2.164987325668335, "logps/chosen": -3.6390368938446045, "logps/rejected": -9.215826034545898, "loss": 0.2458, "rewards/accuracies": 1.0, "rewards/chosen": 1.4463505744934082, "rewards/margins": 1.2776029109954834, "rewards/rejected": 0.16874761879444122, "step": 5271 }, { "epoch": 2.84, "learning_rate": 2.0365706548165472e-08, "logits/chosen": -2.0836358070373535, "logits/rejected": -2.236970901489258, "logps/chosen": -1.0775326490402222, "logps/rejected": -0.9409889578819275, "loss": 0.7096, "rewards/accuracies": 0.0, "rewards/chosen": 0.9522411227226257, "rewards/margins": -0.03264009952545166, "rewards/rejected": 0.9848812222480774, "step": 5272 }, { "epoch": 2.84, "learning_rate": 2.0348120437162098e-08, "logits/chosen": -2.0949530601501465, "logits/rejected": -2.1049792766571045, "logps/chosen": -3.801914691925049, "logps/rejected": -11.34952449798584, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 0.8374679684638977, "rewards/margins": 0.04941624402999878, "rewards/rejected": 0.7880517244338989, "step": 5273 }, { "epoch": 2.84, "learning_rate": 2.0330539982451033e-08, "logits/chosen": -2.1119372844696045, "logits/rejected": -2.30356502532959, "logps/chosen": -0.9512047171592712, "logps/rejected": -6.497584819793701, "loss": 0.5523, "rewards/accuracies": 1.0, "rewards/chosen": 1.0324249267578125, "rewards/margins": 0.3048398494720459, "rewards/rejected": 0.7275850772857666, "step": 5274 }, { "epoch": 2.85, "learning_rate": 2.0312965187385873e-08, "logits/chosen": -2.171480178833008, "logits/rejected": -2.147254467010498, "logps/chosen": -7.385047912597656, "logps/rejected": -2.6851916313171387, "loss": 0.239, "rewards/accuracies": 1.0, "rewards/chosen": 1.9674946069717407, "rewards/margins": 1.3093889951705933, "rewards/rejected": 0.6581056118011475, "step": 5275 }, { "epoch": 2.85, "learning_rate": 2.029539605531912e-08, "logits/chosen": -2.1202352046966553, "logits/rejected": -2.116405487060547, "logps/chosen": -0.8175605535507202, "logps/rejected": -3.5076305866241455, "loss": 0.4606, "rewards/accuracies": 1.0, "rewards/chosen": 1.1864222288131714, "rewards/margins": 0.5361058712005615, "rewards/rejected": 0.6503163576126099, "step": 5276 }, { "epoch": 2.85, "learning_rate": 2.0277832589602205e-08, "logits/chosen": -2.1764652729034424, "logits/rejected": -2.349541425704956, "logps/chosen": -1.2677336931228638, "logps/rejected": -1.2348198890686035, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 1.196772813796997, "rewards/margins": 0.02721726894378662, "rewards/rejected": 1.1695555448532104, "step": 5277 }, { "epoch": 2.85, "learning_rate": 2.0260274793585475e-08, "logits/chosen": -2.141693115234375, "logits/rejected": -2.3502368927001953, "logps/chosen": -4.350564479827881, "logps/rejected": -4.0293474197387695, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.771332323551178, "rewards/margins": -0.0053907036781311035, "rewards/rejected": 0.7767230272293091, "step": 5278 }, { "epoch": 2.85, "learning_rate": 2.0242722670618178e-08, "logits/chosen": -2.0200417041778564, "logits/rejected": -2.3134262561798096, "logps/chosen": -2.6285722255706787, "logps/rejected": -2.64190673828125, "loss": 0.6703, "rewards/accuracies": 1.0, "rewards/chosen": 0.5194470286369324, "rewards/margins": 0.04626855254173279, "rewards/rejected": 0.4731784760951996, "step": 5279 }, { "epoch": 2.85, "learning_rate": 2.0225176224048533e-08, "logits/chosen": -2.2215688228607178, "logits/rejected": -2.384545087814331, "logps/chosen": -4.507283687591553, "logps/rejected": -0.7657930254936218, "loss": 0.8438, "rewards/accuracies": 0.0, "rewards/chosen": 0.8794294595718384, "rewards/margins": -0.28162193298339844, "rewards/rejected": 1.1610513925552368, "step": 5280 }, { "epoch": 2.85, "learning_rate": 2.020763545722362e-08, "logits/chosen": -2.107041835784912, "logits/rejected": -2.116044759750366, "logps/chosen": -0.3638431131839752, "logps/rejected": -12.803877830505371, "loss": 0.5282, "rewards/accuracies": 1.0, "rewards/chosen": 1.0410888195037842, "rewards/margins": 0.3626108765602112, "rewards/rejected": 0.678477942943573, "step": 5281 }, { "epoch": 2.85, "learning_rate": 2.0190100373489467e-08, "logits/chosen": -2.139774799346924, "logits/rejected": -2.3444676399230957, "logps/chosen": -1.2864537239074707, "logps/rejected": -0.8853126764297485, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9903465509414673, "rewards/margins": 0.0011929869651794434, "rewards/rejected": 0.9891535639762878, "step": 5282 }, { "epoch": 2.85, "learning_rate": 2.0172570976191013e-08, "logits/chosen": -2.0785577297210693, "logits/rejected": -2.2915451526641846, "logps/chosen": -5.442335605621338, "logps/rejected": -2.594559669494629, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9541398882865906, "rewards/margins": 0.005106329917907715, "rewards/rejected": 0.9490335583686829, "step": 5283 }, { "epoch": 2.85, "learning_rate": 2.0155047268672077e-08, "logits/chosen": -2.1686649322509766, "logits/rejected": -2.1697704792022705, "logps/chosen": -0.12302348762750626, "logps/rejected": -3.931715250015259, "loss": 0.4553, "rewards/accuracies": 1.0, "rewards/chosen": 1.0330297946929932, "rewards/margins": 0.5503948926925659, "rewards/rejected": 0.48263493180274963, "step": 5284 }, { "epoch": 2.85, "learning_rate": 2.0137529254275433e-08, "logits/chosen": -1.9460811614990234, "logits/rejected": -1.9536501169204712, "logps/chosen": -1.3236562013626099, "logps/rejected": -5.454648494720459, "loss": 0.4168, "rewards/accuracies": 1.0, "rewards/chosen": 0.9900081753730774, "rewards/margins": 0.6596492528915405, "rewards/rejected": 0.33035895228385925, "step": 5285 }, { "epoch": 2.85, "learning_rate": 2.012001693634277e-08, "logits/chosen": -2.0241620540618896, "logits/rejected": -2.0338387489318848, "logps/chosen": -1.2357261180877686, "logps/rejected": -4.355887413024902, "loss": 0.4521, "rewards/accuracies": 1.0, "rewards/chosen": 0.9980325698852539, "rewards/margins": 0.5592085123062134, "rewards/rejected": 0.4388240873813629, "step": 5286 }, { "epoch": 2.85, "learning_rate": 2.0102510318214672e-08, "logits/chosen": -2.057260036468506, "logits/rejected": -2.0567476749420166, "logps/chosen": -5.404446601867676, "logps/rejected": -3.0536367893218994, "loss": 0.2876, "rewards/accuracies": 1.0, "rewards/chosen": 1.651105523109436, "rewards/margins": 1.0987468957901, "rewards/rejected": 0.5523586273193359, "step": 5287 }, { "epoch": 2.85, "learning_rate": 2.008500940323064e-08, "logits/chosen": -2.008439779281616, "logits/rejected": -2.0097808837890625, "logps/chosen": -6.040658473968506, "logps/rejected": -1.3950515985488892, "loss": 0.4199, "rewards/accuracies": 1.0, "rewards/chosen": 1.2368658781051636, "rewards/margins": 0.6505296230316162, "rewards/rejected": 0.5863362550735474, "step": 5288 }, { "epoch": 2.85, "learning_rate": 2.006751419472909e-08, "logits/chosen": -2.0678069591522217, "logits/rejected": -2.278259038925171, "logps/chosen": -0.25427255034446716, "logps/rejected": -0.21599262952804565, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.8181523680686951, "rewards/margins": 0.010417759418487549, "rewards/rejected": 0.8077346086502075, "step": 5289 }, { "epoch": 2.85, "learning_rate": 2.005002469604734e-08, "logits/chosen": -2.233201026916504, "logits/rejected": -2.0499062538146973, "logps/chosen": -31.922502517700195, "logps/rejected": -3.6758131980895996, "loss": 0.0647, "rewards/accuracies": 1.0, "rewards/chosen": 3.178004741668701, "rewards/margins": 2.7051196098327637, "rewards/rejected": 0.4728851020336151, "step": 5290 }, { "epoch": 2.85, "learning_rate": 2.003254091052165e-08, "logits/chosen": -2.0900089740753174, "logits/rejected": -2.085752010345459, "logps/chosen": -2.411932945251465, "logps/rejected": -6.320532321929932, "loss": 0.4574, "rewards/accuracies": 1.0, "rewards/chosen": 0.9939886331558228, "rewards/margins": 0.5448359251022339, "rewards/rejected": 0.44915270805358887, "step": 5291 }, { "epoch": 2.85, "learning_rate": 2.0015062841487162e-08, "logits/chosen": -2.2501370906829834, "logits/rejected": -2.2506394386291504, "logps/chosen": -0.4032822251319885, "logps/rejected": -4.890387058258057, "loss": 0.4559, "rewards/accuracies": 1.0, "rewards/chosen": 0.7364599108695984, "rewards/margins": 0.5489237308502197, "rewards/rejected": 0.18753615021705627, "step": 5292 }, { "epoch": 2.85, "learning_rate": 1.9997590492277926e-08, "logits/chosen": -2.0602333545684814, "logits/rejected": -2.0664005279541016, "logps/chosen": -1.9839986562728882, "logps/rejected": -5.022469520568848, "loss": 0.4323, "rewards/accuracies": 1.0, "rewards/chosen": 1.1834183931350708, "rewards/margins": 0.6148313879966736, "rewards/rejected": 0.5685870051383972, "step": 5293 }, { "epoch": 2.86, "learning_rate": 1.9980123866226913e-08, "logits/chosen": -2.1995394229888916, "logits/rejected": -2.198336362838745, "logps/chosen": -0.4982338845729828, "logps/rejected": -2.722961187362671, "loss": 0.5148, "rewards/accuracies": 1.0, "rewards/chosen": 1.0927008390426636, "rewards/margins": 0.3954765200614929, "rewards/rejected": 0.6972243189811707, "step": 5294 }, { "epoch": 2.86, "learning_rate": 1.9962662966665988e-08, "logits/chosen": -2.1505706310272217, "logits/rejected": -2.064103364944458, "logps/chosen": -12.203927993774414, "logps/rejected": -4.130691051483154, "loss": 0.3164, "rewards/accuracies": 1.0, "rewards/chosen": 1.7418088912963867, "rewards/margins": 0.9883666634559631, "rewards/rejected": 0.7534422278404236, "step": 5295 }, { "epoch": 2.86, "learning_rate": 1.994520779692594e-08, "logits/chosen": -2.103515863418579, "logits/rejected": -2.130763292312622, "logps/chosen": -17.012767791748047, "logps/rejected": -14.71993637084961, "loss": 0.1091, "rewards/accuracies": 1.0, "rewards/chosen": 2.497335433959961, "rewards/margins": 2.1608028411865234, "rewards/rejected": 0.3365325927734375, "step": 5296 }, { "epoch": 2.86, "learning_rate": 1.9927758360336462e-08, "logits/chosen": -2.2166624069213867, "logits/rejected": -2.2186272144317627, "logps/chosen": -1.1168745756149292, "logps/rejected": -3.311915636062622, "loss": 0.4614, "rewards/accuracies": 1.0, "rewards/chosen": 1.2839888334274292, "rewards/margins": 0.5338909029960632, "rewards/rejected": 0.750097930431366, "step": 5297 }, { "epoch": 2.86, "learning_rate": 1.9910314660226146e-08, "logits/chosen": -2.005744457244873, "logits/rejected": -2.006033420562744, "logps/chosen": -1.8908010721206665, "logps/rejected": -0.9091497659683228, "loss": 0.7098, "rewards/accuracies": 0.0, "rewards/chosen": 1.0235141515731812, "rewards/margins": -0.033091068267822266, "rewards/rejected": 1.0566052198410034, "step": 5298 }, { "epoch": 2.86, "learning_rate": 1.989287669992249e-08, "logits/chosen": -2.098217010498047, "logits/rejected": -2.0770437717437744, "logps/chosen": -7.27831506729126, "logps/rejected": -1.7341388463974, "loss": 0.332, "rewards/accuracies": 1.0, "rewards/chosen": 1.7645654678344727, "rewards/margins": 0.9319884181022644, "rewards/rejected": 0.8325770497322083, "step": 5299 }, { "epoch": 2.86, "learning_rate": 1.987544448275192e-08, "logits/chosen": -2.084249496459961, "logits/rejected": -2.073744297027588, "logps/chosen": -0.2304675579071045, "logps/rejected": -7.6883344650268555, "loss": 0.4142, "rewards/accuracies": 1.0, "rewards/chosen": 0.9359885454177856, "rewards/margins": 0.6671645641326904, "rewards/rejected": 0.2688240110874176, "step": 5300 }, { "epoch": 2.86, "learning_rate": 1.9858018012039697e-08, "logits/chosen": -2.175102710723877, "logits/rejected": -2.170546293258667, "logps/chosen": -3.0980892181396484, "logps/rejected": -4.079869270324707, "loss": 0.3252, "rewards/accuracies": 1.0, "rewards/chosen": 1.5851675271987915, "rewards/margins": 0.9563120007514954, "rewards/rejected": 0.6288555264472961, "step": 5301 }, { "epoch": 2.86, "learning_rate": 1.9840597291110083e-08, "logits/chosen": -2.057138204574585, "logits/rejected": -2.0432636737823486, "logps/chosen": -1.6222742795944214, "logps/rejected": -5.420626163482666, "loss": 0.4318, "rewards/accuracies": 1.0, "rewards/chosen": 1.437689185142517, "rewards/margins": 0.6160150170326233, "rewards/rejected": 0.8216741681098938, "step": 5302 }, { "epoch": 2.86, "learning_rate": 1.9823182323286185e-08, "logits/chosen": -2.006383180618286, "logits/rejected": -2.014592170715332, "logps/chosen": -1.2450611591339111, "logps/rejected": -3.6737608909606934, "loss": 0.444, "rewards/accuracies": 1.0, "rewards/chosen": 1.044852614402771, "rewards/margins": 0.5816668272018433, "rewards/rejected": 0.46318575739860535, "step": 5303 }, { "epoch": 2.86, "learning_rate": 1.9805773111890017e-08, "logits/chosen": -2.074727773666382, "logits/rejected": -2.078253746032715, "logps/chosen": -5.363028049468994, "logps/rejected": -4.138192653656006, "loss": 0.5491, "rewards/accuracies": 1.0, "rewards/chosen": 0.9136589169502258, "rewards/margins": 0.3124728798866272, "rewards/rejected": 0.6011860370635986, "step": 5304 }, { "epoch": 2.86, "learning_rate": 1.978836966024251e-08, "logits/chosen": -1.982587218284607, "logits/rejected": -1.9828808307647705, "logps/chosen": -0.08787524700164795, "logps/rejected": -8.363466262817383, "loss": 0.4631, "rewards/accuracies": 1.0, "rewards/chosen": 0.8330618143081665, "rewards/margins": 0.5293477177619934, "rewards/rejected": 0.3037140965461731, "step": 5305 }, { "epoch": 2.86, "learning_rate": 1.9770971971663475e-08, "logits/chosen": -2.112273693084717, "logits/rejected": -2.265791893005371, "logps/chosen": -0.45386916399002075, "logps/rejected": -0.5630219578742981, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 1.0448558330535889, "rewards/margins": 0.01909816265106201, "rewards/rejected": 1.0257576704025269, "step": 5306 }, { "epoch": 2.86, "learning_rate": 1.9753580049471645e-08, "logits/chosen": -2.1346373558044434, "logits/rejected": -2.2592780590057373, "logps/chosen": -0.15533816814422607, "logps/rejected": -0.2222476601600647, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.9088563919067383, "rewards/margins": 0.03130149841308594, "rewards/rejected": 0.8775548934936523, "step": 5307 }, { "epoch": 2.86, "learning_rate": 1.973619389698464e-08, "logits/chosen": -2.0413730144500732, "logits/rejected": -2.010800838470459, "logps/chosen": -4.978940010070801, "logps/rejected": -4.3161468505859375, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": 1.50374174118042, "rewards/margins": 1.0832114219665527, "rewards/rejected": 0.4205303192138672, "step": 5308 }, { "epoch": 2.86, "learning_rate": 1.9718813517518995e-08, "logits/chosen": -2.0692431926727295, "logits/rejected": -2.314326524734497, "logps/chosen": -0.7770639657974243, "logps/rejected": -0.6567245125770569, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 1.0289815664291382, "rewards/margins": 0.025505781173706055, "rewards/rejected": 1.0034757852554321, "step": 5309 }, { "epoch": 2.86, "learning_rate": 1.970143891439012e-08, "logits/chosen": -1.9878162145614624, "logits/rejected": -2.0050339698791504, "logps/chosen": -0.6638566851615906, "logps/rejected": -13.986181259155273, "loss": 0.6711, "rewards/accuracies": 1.0, "rewards/chosen": 0.940203845500946, "rewards/margins": 0.044691264629364014, "rewards/rejected": 0.895512580871582, "step": 5310 }, { "epoch": 2.86, "learning_rate": 1.968407009091233e-08, "logits/chosen": -2.071183204650879, "logits/rejected": -2.0722036361694336, "logps/chosen": -4.902407646179199, "logps/rejected": -5.145462512969971, "loss": 0.2687, "rewards/accuracies": 1.0, "rewards/chosen": 1.533821940422058, "rewards/margins": 1.176642894744873, "rewards/rejected": 0.35717901587486267, "step": 5311 }, { "epoch": 2.87, "learning_rate": 1.9666707050398857e-08, "logits/chosen": -2.1294894218444824, "logits/rejected": -2.2496848106384277, "logps/chosen": -1.0283682346343994, "logps/rejected": -1.1298742294311523, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.915562629699707, "rewards/margins": 0.03442627191543579, "rewards/rejected": 0.8811363577842712, "step": 5312 }, { "epoch": 2.87, "learning_rate": 1.9649349796161814e-08, "logits/chosen": -2.0029985904693604, "logits/rejected": -2.287290573120117, "logps/chosen": -0.34125202894210815, "logps/rejected": -0.38365495204925537, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8885602355003357, "rewards/margins": 0.009817838668823242, "rewards/rejected": 0.8787423968315125, "step": 5313 }, { "epoch": 2.87, "learning_rate": 1.9631998331512208e-08, "logits/chosen": -2.098578453063965, "logits/rejected": -2.2719759941101074, "logps/chosen": -4.243177890777588, "logps/rejected": -4.068691253662109, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 1.068755030632019, "rewards/margins": 0.007536411285400391, "rewards/rejected": 1.0612186193466187, "step": 5314 }, { "epoch": 2.87, "learning_rate": 1.9614652659759944e-08, "logits/chosen": -2.1388936042785645, "logits/rejected": -2.082202672958374, "logps/chosen": -15.813620567321777, "logps/rejected": -4.403087615966797, "loss": 0.236, "rewards/accuracies": 1.0, "rewards/chosen": 1.843221664428711, "rewards/margins": 1.323385238647461, "rewards/rejected": 0.51983642578125, "step": 5315 }, { "epoch": 2.87, "learning_rate": 1.959731278421383e-08, "logits/chosen": -2.113266944885254, "logits/rejected": -2.11252760887146, "logps/chosen": -0.9258711934089661, "logps/rejected": -3.3526787757873535, "loss": 0.5505, "rewards/accuracies": 1.0, "rewards/chosen": 1.0114872455596924, "rewards/margins": 0.30913245677948, "rewards/rejected": 0.7023547887802124, "step": 5316 }, { "epoch": 2.87, "learning_rate": 1.9579978708181556e-08, "logits/chosen": -2.1510167121887207, "logits/rejected": -2.1571428775787354, "logps/chosen": -1.1561530828475952, "logps/rejected": -5.305259704589844, "loss": 0.3914, "rewards/accuracies": 1.0, "rewards/chosen": 1.1179107427597046, "rewards/margins": 0.735970139503479, "rewards/rejected": 0.3819405734539032, "step": 5317 }, { "epoch": 2.87, "learning_rate": 1.9562650434969718e-08, "logits/chosen": -2.0252342224121094, "logits/rejected": -2.0200979709625244, "logps/chosen": -4.018661975860596, "logps/rejected": -1.8558380603790283, "loss": 0.2942, "rewards/accuracies": 1.0, "rewards/chosen": 1.6801925897598267, "rewards/margins": 1.0726642608642578, "rewards/rejected": 0.6075283885002136, "step": 5318 }, { "epoch": 2.87, "learning_rate": 1.954532796788379e-08, "logits/chosen": -2.0391147136688232, "logits/rejected": -2.2773988246917725, "logps/chosen": -7.940470218658447, "logps/rejected": -6.548534870147705, "loss": 0.7414, "rewards/accuracies": 0.0, "rewards/chosen": 0.5342776775360107, "rewards/margins": -0.09422796964645386, "rewards/rejected": 0.6285056471824646, "step": 5319 }, { "epoch": 2.87, "learning_rate": 1.9528011310228154e-08, "logits/chosen": -2.033769369125366, "logits/rejected": -2.0765881538391113, "logps/chosen": -0.8342763185501099, "logps/rejected": -14.21649169921875, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.79853355884552, "rewards/margins": 0.027348041534423828, "rewards/rejected": 0.7711855173110962, "step": 5320 }, { "epoch": 2.87, "learning_rate": 1.951070046530608e-08, "logits/chosen": -2.2306692600250244, "logits/rejected": -2.3627331256866455, "logps/chosen": -1.2922537326812744, "logps/rejected": -1.3955276012420654, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 1.062111496925354, "rewards/margins": 0.026203632354736328, "rewards/rejected": 1.0359078645706177, "step": 5321 }, { "epoch": 2.87, "learning_rate": 1.949339543641972e-08, "logits/chosen": -2.1080853939056396, "logits/rejected": -2.139925003051758, "logps/chosen": -3.0829899311065674, "logps/rejected": -6.595308303833008, "loss": 0.3376, "rewards/accuracies": 1.0, "rewards/chosen": 1.6386159658432007, "rewards/margins": 0.9122253060340881, "rewards/rejected": 0.7263906598091125, "step": 5322 }, { "epoch": 2.87, "learning_rate": 1.947609622687012e-08, "logits/chosen": -2.093555450439453, "logits/rejected": -2.095397710800171, "logps/chosen": -2.5328216552734375, "logps/rejected": -13.099493980407715, "loss": 0.2702, "rewards/accuracies": 1.0, "rewards/chosen": 1.2970373630523682, "rewards/margins": 1.1704705953598022, "rewards/rejected": 0.1265667974948883, "step": 5323 }, { "epoch": 2.87, "learning_rate": 1.9458802839957207e-08, "logits/chosen": -2.16344952583313, "logits/rejected": -2.1577842235565186, "logps/chosen": -6.696433067321777, "logps/rejected": -6.329492092132568, "loss": 0.3521, "rewards/accuracies": 1.0, "rewards/chosen": 1.1256279945373535, "rewards/margins": 0.8625263571739197, "rewards/rejected": 0.26310163736343384, "step": 5324 }, { "epoch": 2.87, "learning_rate": 1.9441515278979847e-08, "logits/chosen": -2.102586269378662, "logits/rejected": -2.1039035320281982, "logps/chosen": -2.177138566970825, "logps/rejected": -4.837035179138184, "loss": 0.2787, "rewards/accuracies": 1.0, "rewards/chosen": 1.5739012956619263, "rewards/margins": 1.1349339485168457, "rewards/rejected": 0.4389673173427582, "step": 5325 }, { "epoch": 2.87, "learning_rate": 1.942423354723573e-08, "logits/chosen": -1.9964313507080078, "logits/rejected": -2.2630057334899902, "logps/chosen": -1.5018588304519653, "logps/rejected": -1.6091846227645874, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 0.9908923506736755, "rewards/margins": -0.01451951265335083, "rewards/rejected": 1.0054118633270264, "step": 5326 }, { "epoch": 2.87, "learning_rate": 1.940695764802147e-08, "logits/chosen": -2.1451940536499023, "logits/rejected": -2.3450562953948975, "logps/chosen": -0.03745806962251663, "logps/rejected": -0.03966715931892395, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.7691850066184998, "rewards/margins": 0.028079569339752197, "rewards/rejected": 0.7411054372787476, "step": 5327 }, { "epoch": 2.87, "learning_rate": 1.9389687584632552e-08, "logits/chosen": -2.0515928268432617, "logits/rejected": -2.284085512161255, "logps/chosen": -6.117739677429199, "logps/rejected": -4.295294761657715, "loss": 0.6487, "rewards/accuracies": 1.0, "rewards/chosen": 0.9121896028518677, "rewards/margins": 0.09097480773925781, "rewards/rejected": 0.8212147951126099, "step": 5328 }, { "epoch": 2.87, "learning_rate": 1.937242336036336e-08, "logits/chosen": -2.172626495361328, "logits/rejected": -2.172609329223633, "logps/chosen": -0.23220521211624146, "logps/rejected": -6.353573799133301, "loss": 0.4185, "rewards/accuracies": 1.0, "rewards/chosen": 0.8833133578300476, "rewards/margins": 0.6544911861419678, "rewards/rejected": 0.22882214188575745, "step": 5329 }, { "epoch": 2.87, "learning_rate": 1.935516497850717e-08, "logits/chosen": -2.0563604831695557, "logits/rejected": -2.3207013607025146, "logps/chosen": -0.5436163544654846, "logps/rejected": -0.5566728711128235, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.8353117108345032, "rewards/margins": 0.027967453002929688, "rewards/rejected": 0.8073442578315735, "step": 5330 }, { "epoch": 2.88, "learning_rate": 1.9337912442356107e-08, "logits/chosen": -2.0047202110290527, "logits/rejected": -2.0029518604278564, "logps/chosen": -2.4439358711242676, "logps/rejected": -4.102630615234375, "loss": 0.3827, "rewards/accuracies": 1.0, "rewards/chosen": 1.2296828031539917, "rewards/margins": 0.763185977935791, "rewards/rejected": 0.46649685502052307, "step": 5331 }, { "epoch": 2.88, "learning_rate": 1.9320665755201216e-08, "logits/chosen": -2.0473546981811523, "logits/rejected": -2.319361925125122, "logps/chosen": -0.41770386695861816, "logps/rejected": -0.5000771880149841, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 1.0631866455078125, "rewards/margins": -0.005002379417419434, "rewards/rejected": 1.068189024925232, "step": 5332 }, { "epoch": 2.88, "learning_rate": 1.9303424920332423e-08, "logits/chosen": -2.13055157661438, "logits/rejected": -2.132133960723877, "logps/chosen": -1.7141481637954712, "logps/rejected": -1.6267284154891968, "loss": 0.5613, "rewards/accuracies": 1.0, "rewards/chosen": 1.0185720920562744, "rewards/margins": 0.28378647565841675, "rewards/rejected": 0.7347856163978577, "step": 5333 }, { "epoch": 2.88, "learning_rate": 1.928618994103853e-08, "logits/chosen": -2.1350178718566895, "logits/rejected": -2.1349217891693115, "logps/chosen": -0.7902999520301819, "logps/rejected": -1.3925929069519043, "loss": 0.5508, "rewards/accuracies": 1.0, "rewards/chosen": 1.0720977783203125, "rewards/margins": 0.3083641529083252, "rewards/rejected": 0.7637336254119873, "step": 5334 }, { "epoch": 2.88, "learning_rate": 1.9268960820607193e-08, "logits/chosen": -2.157590627670288, "logits/rejected": -2.2023348808288574, "logps/chosen": -2.9015932083129883, "logps/rejected": -12.147872924804688, "loss": 0.261, "rewards/accuracies": 1.0, "rewards/chosen": 1.7340935468673706, "rewards/margins": 1.2098610401153564, "rewards/rejected": 0.5242325067520142, "step": 5335 }, { "epoch": 2.88, "learning_rate": 1.9251737562325038e-08, "logits/chosen": -2.0504841804504395, "logits/rejected": -2.035703182220459, "logps/chosen": -1.826176404953003, "logps/rejected": -4.367782115936279, "loss": 0.3663, "rewards/accuracies": 1.0, "rewards/chosen": 1.314934253692627, "rewards/margins": 0.8156194686889648, "rewards/rejected": 0.4993147552013397, "step": 5336 }, { "epoch": 2.88, "learning_rate": 1.923452016947748e-08, "logits/chosen": -2.0922000408172607, "logits/rejected": -2.0958242416381836, "logps/chosen": -0.38206759095191956, "logps/rejected": -7.1478986740112305, "loss": 0.4735, "rewards/accuracies": 1.0, "rewards/chosen": 0.9432536363601685, "rewards/margins": 0.5015276074409485, "rewards/rejected": 0.44172602891921997, "step": 5337 }, { "epoch": 2.88, "learning_rate": 1.9217308645348862e-08, "logits/chosen": -2.0759615898132324, "logits/rejected": -2.0733752250671387, "logps/chosen": -0.38692203164100647, "logps/rejected": -4.43862247467041, "loss": 0.471, "rewards/accuracies": 1.0, "rewards/chosen": 1.0070675611495972, "rewards/margins": 0.5081034898757935, "rewards/rejected": 0.4989640414714813, "step": 5338 }, { "epoch": 2.88, "learning_rate": 1.920010299322239e-08, "logits/chosen": -2.1397604942321777, "logits/rejected": -2.3073177337646484, "logps/chosen": -3.153181552886963, "logps/rejected": -1.154584527015686, "loss": 0.6409, "rewards/accuracies": 1.0, "rewards/chosen": 0.9979610443115234, "rewards/margins": 0.10745179653167725, "rewards/rejected": 0.8905092477798462, "step": 5339 }, { "epoch": 2.88, "learning_rate": 1.9182903216380154e-08, "logits/chosen": -2.04638934135437, "logits/rejected": -2.3031136989593506, "logps/chosen": -2.1379449367523193, "logps/rejected": -2.1925957202911377, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.8441882133483887, "rewards/margins": 0.010744631290435791, "rewards/rejected": 0.8334435820579529, "step": 5340 }, { "epoch": 2.88, "learning_rate": 1.9165709318103128e-08, "logits/chosen": -2.1143200397491455, "logits/rejected": -2.1134676933288574, "logps/chosen": -0.5841930508613586, "logps/rejected": -1.7660630941390991, "loss": 0.6223, "rewards/accuracies": 1.0, "rewards/chosen": 0.9640340209007263, "rewards/margins": 0.14708352088928223, "rewards/rejected": 0.8169505000114441, "step": 5341 }, { "epoch": 2.88, "learning_rate": 1.9148521301671172e-08, "logits/chosen": -2.021030902862549, "logits/rejected": -2.3192520141601562, "logps/chosen": -2.3736343383789062, "logps/rejected": -3.0952892303466797, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 1.0237947702407837, "rewards/margins": 0.051296889781951904, "rewards/rejected": 0.9724978804588318, "step": 5342 }, { "epoch": 2.88, "learning_rate": 1.9131339170363e-08, "logits/chosen": -2.0529723167419434, "logits/rejected": -2.0535008907318115, "logps/chosen": -1.6585218906402588, "logps/rejected": -4.620965003967285, "loss": 0.5813, "rewards/accuracies": 1.0, "rewards/chosen": 1.1617268323898315, "rewards/margins": 0.2377048134803772, "rewards/rejected": 0.9240220189094543, "step": 5343 }, { "epoch": 2.88, "learning_rate": 1.9114162927456218e-08, "logits/chosen": -2.050898313522339, "logits/rejected": -2.0555953979492188, "logps/chosen": -0.8270501494407654, "logps/rejected": -13.693650245666504, "loss": 0.4322, "rewards/accuracies": 1.0, "rewards/chosen": 1.0916991233825684, "rewards/margins": 0.6149465441703796, "rewards/rejected": 0.4767525792121887, "step": 5344 }, { "epoch": 2.88, "learning_rate": 1.9096992576227315e-08, "logits/chosen": -2.12484073638916, "logits/rejected": -2.153752088546753, "logps/chosen": -2.8685715198516846, "logps/rejected": -13.384539604187012, "loss": 0.2469, "rewards/accuracies": 1.0, "rewards/chosen": 1.5890141725540161, "rewards/margins": 1.2728351354599, "rewards/rejected": 0.3161790072917938, "step": 5345 }, { "epoch": 2.88, "learning_rate": 1.9079828119951645e-08, "logits/chosen": -2.0387215614318848, "logits/rejected": -2.034564733505249, "logps/chosen": -0.6026394963264465, "logps/rejected": -2.997737407684326, "loss": 0.6019, "rewards/accuracies": 1.0, "rewards/chosen": 0.9333357810974121, "rewards/margins": 0.1916932463645935, "rewards/rejected": 0.7416425347328186, "step": 5346 }, { "epoch": 2.88, "learning_rate": 1.906266956190345e-08, "logits/chosen": -2.0727972984313965, "logits/rejected": -2.0804808139801025, "logps/chosen": -5.1750898361206055, "logps/rejected": -1.155606985092163, "loss": 0.4312, "rewards/accuracies": 1.0, "rewards/chosen": 1.4018607139587402, "rewards/margins": 0.6177697777748108, "rewards/rejected": 0.7840909361839294, "step": 5347 }, { "epoch": 2.88, "learning_rate": 1.9045516905355823e-08, "logits/chosen": -1.9982341527938843, "logits/rejected": -1.998206377029419, "logps/chosen": -0.5295376181602478, "logps/rejected": -2.2221927642822266, "loss": 0.5225, "rewards/accuracies": 1.0, "rewards/chosen": 1.0231480598449707, "rewards/margins": 0.3765972852706909, "rewards/rejected": 0.6465507745742798, "step": 5348 }, { "epoch": 2.89, "learning_rate": 1.9028370153580757e-08, "logits/chosen": -2.0509746074676514, "logits/rejected": -2.053797483444214, "logps/chosen": -2.299581289291382, "logps/rejected": -1.0566600561141968, "loss": 0.484, "rewards/accuracies": 1.0, "rewards/chosen": 1.2937678098678589, "rewards/margins": 0.4738842844963074, "rewards/rejected": 0.8198835253715515, "step": 5349 }, { "epoch": 2.89, "learning_rate": 1.901122930984911e-08, "logits/chosen": -2.116075038909912, "logits/rejected": -2.1136107444763184, "logps/chosen": -3.0278849601745605, "logps/rejected": -2.7839038372039795, "loss": 0.4951, "rewards/accuracies": 1.0, "rewards/chosen": 1.186610460281372, "rewards/margins": 0.44516950845718384, "rewards/rejected": 0.7414409518241882, "step": 5350 }, { "epoch": 2.89, "learning_rate": 1.8994094377430603e-08, "logits/chosen": -2.1043624877929688, "logits/rejected": -2.1097729206085205, "logps/chosen": -2.0970144271850586, "logps/rejected": -4.625880718231201, "loss": 0.4472, "rewards/accuracies": 1.0, "rewards/chosen": 1.0441631078720093, "rewards/margins": 0.5728724002838135, "rewards/rejected": 0.4712907373905182, "step": 5351 }, { "epoch": 2.89, "learning_rate": 1.897696535959385e-08, "logits/chosen": -2.176408290863037, "logits/rejected": -2.3355467319488525, "logps/chosen": -1.6798145771026611, "logps/rejected": -1.6096482276916504, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 1.0222874879837036, "rewards/margins": 0.017134785652160645, "rewards/rejected": 1.005152702331543, "step": 5352 }, { "epoch": 2.89, "learning_rate": 1.8959842259606312e-08, "logits/chosen": -1.9349830150604248, "logits/rejected": -2.2400007247924805, "logps/chosen": -0.11295580863952637, "logps/rejected": -0.11844804883003235, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.8219934701919556, "rewards/margins": 0.009262025356292725, "rewards/rejected": 0.8127314448356628, "step": 5353 }, { "epoch": 2.89, "learning_rate": 1.894272508073434e-08, "logits/chosen": -2.1013119220733643, "logits/rejected": -2.0878891944885254, "logps/chosen": -5.5069780349731445, "logps/rejected": -8.433608055114746, "loss": 0.2571, "rewards/accuracies": 1.0, "rewards/chosen": 1.3783278465270996, "rewards/margins": 1.227064847946167, "rewards/rejected": 0.15126295387744904, "step": 5354 }, { "epoch": 2.89, "learning_rate": 1.8925613826243148e-08, "logits/chosen": -2.1694531440734863, "logits/rejected": -2.167372226715088, "logps/chosen": -20.22134780883789, "logps/rejected": -8.01984691619873, "loss": 0.2676, "rewards/accuracies": 1.0, "rewards/chosen": 1.6704044342041016, "rewards/margins": 1.1816580295562744, "rewards/rejected": 0.4887464642524719, "step": 5355 }, { "epoch": 2.89, "learning_rate": 1.8908508499396818e-08, "logits/chosen": -2.0705747604370117, "logits/rejected": -2.0730373859405518, "logps/chosen": -4.817205905914307, "logps/rejected": -0.4375535249710083, "loss": 0.5491, "rewards/accuracies": 1.0, "rewards/chosen": 1.264578104019165, "rewards/margins": 0.3123939037322998, "rewards/rejected": 0.9521842002868652, "step": 5356 }, { "epoch": 2.89, "learning_rate": 1.8891409103458293e-08, "logits/chosen": -2.052748680114746, "logits/rejected": -2.3109376430511475, "logps/chosen": -3.801082134246826, "logps/rejected": -6.825252532958984, "loss": 0.6188, "rewards/accuracies": 1.0, "rewards/chosen": 1.2284307479858398, "rewards/margins": 0.15469396114349365, "rewards/rejected": 1.0737367868423462, "step": 5357 }, { "epoch": 2.89, "learning_rate": 1.8874315641689432e-08, "logits/chosen": -2.1846232414245605, "logits/rejected": -2.3051860332489014, "logps/chosen": -1.5755788087844849, "logps/rejected": -1.6339830160140991, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 1.0368841886520386, "rewards/margins": 0.012866020202636719, "rewards/rejected": 1.0240181684494019, "step": 5358 }, { "epoch": 2.89, "learning_rate": 1.8857228117350914e-08, "logits/chosen": -1.9811941385269165, "logits/rejected": -1.9803160429000854, "logps/chosen": -0.579695463180542, "logps/rejected": -3.1180038452148438, "loss": 0.554, "rewards/accuracies": 1.0, "rewards/chosen": 0.9437164664268494, "rewards/margins": 0.30086880922317505, "rewards/rejected": 0.6428476572036743, "step": 5359 }, { "epoch": 2.89, "learning_rate": 1.8840146533702277e-08, "logits/chosen": -2.1158711910247803, "logits/rejected": -2.1184144020080566, "logps/chosen": -0.18693995475769043, "logps/rejected": -6.3225908279418945, "loss": 0.3801, "rewards/accuracies": 1.0, "rewards/chosen": 0.997998058795929, "rewards/margins": 0.7712233662605286, "rewards/rejected": 0.2267746925354004, "step": 5360 }, { "epoch": 2.89, "learning_rate": 1.882307089400195e-08, "logits/chosen": -2.204375982284546, "logits/rejected": -2.200148105621338, "logps/chosen": -8.252336502075195, "logps/rejected": -0.6178700923919678, "loss": 0.6284, "rewards/accuracies": 1.0, "rewards/chosen": 0.8380516171455383, "rewards/margins": 0.1339513063430786, "rewards/rejected": 0.7041003108024597, "step": 5361 }, { "epoch": 2.89, "learning_rate": 1.8806001201507242e-08, "logits/chosen": -2.2343621253967285, "logits/rejected": -2.1656479835510254, "logps/chosen": -27.48328399658203, "logps/rejected": -6.522639751434326, "loss": 0.1869, "rewards/accuracies": 1.0, "rewards/chosen": 2.1017346382141113, "rewards/margins": 1.582220196723938, "rewards/rejected": 0.5195144414901733, "step": 5362 }, { "epoch": 2.89, "learning_rate": 1.878893745947429e-08, "logits/chosen": -2.0555450916290283, "logits/rejected": -2.053274154663086, "logps/chosen": -2.0629866123199463, "logps/rejected": -6.688637733459473, "loss": 0.3774, "rewards/accuracies": 1.0, "rewards/chosen": 1.0306203365325928, "rewards/margins": 0.7798770666122437, "rewards/rejected": 0.2507432997226715, "step": 5363 }, { "epoch": 2.89, "learning_rate": 1.8771879671158132e-08, "logits/chosen": -2.256908893585205, "logits/rejected": -2.2540931701660156, "logps/chosen": -2.7395083904266357, "logps/rejected": -6.0744171142578125, "loss": 0.4231, "rewards/accuracies": 1.0, "rewards/chosen": 0.9490727782249451, "rewards/margins": 0.6412087678909302, "rewards/rejected": 0.3078640103340149, "step": 5364 }, { "epoch": 2.89, "learning_rate": 1.8754827839812643e-08, "logits/chosen": -2.141310930252075, "logits/rejected": -2.216472864151001, "logps/chosen": -4.530174732208252, "logps/rejected": -25.002662658691406, "loss": 0.5762, "rewards/accuracies": 1.0, "rewards/chosen": 1.0143979787826538, "rewards/margins": 0.249381422996521, "rewards/rejected": 0.7650165557861328, "step": 5365 }, { "epoch": 2.89, "learning_rate": 1.8737781968690574e-08, "logits/chosen": -1.9856693744659424, "logits/rejected": -1.996527910232544, "logps/chosen": -3.0920321941375732, "logps/rejected": -5.081174850463867, "loss": 0.4037, "rewards/accuracies": 1.0, "rewards/chosen": 1.0784268379211426, "rewards/margins": 0.6983462572097778, "rewards/rejected": 0.38008061051368713, "step": 5366 }, { "epoch": 2.89, "learning_rate": 1.8720742061043543e-08, "logits/chosen": -2.0904312133789062, "logits/rejected": -2.282525062561035, "logps/chosen": -0.11060211807489395, "logps/rejected": -0.1151827871799469, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.929020881652832, "rewards/margins": -0.002677738666534424, "rewards/rejected": 0.9316986203193665, "step": 5367 }, { "epoch": 2.9, "learning_rate": 1.8703708120122023e-08, "logits/chosen": -2.15863299369812, "logits/rejected": -2.153294801712036, "logps/chosen": -2.1165239810943604, "logps/rejected": -4.622865676879883, "loss": 0.3272, "rewards/accuracies": 1.0, "rewards/chosen": 1.595296859741211, "rewards/margins": 0.9492104649543762, "rewards/rejected": 0.6460863947868347, "step": 5368 }, { "epoch": 2.9, "learning_rate": 1.8686680149175338e-08, "logits/chosen": -2.1010901927948, "logits/rejected": -2.4063103199005127, "logps/chosen": -18.202133178710938, "logps/rejected": -15.37959098815918, "loss": 0.6262, "rewards/accuracies": 1.0, "rewards/chosen": 0.4033428132534027, "rewards/margins": 0.13866138458251953, "rewards/rejected": 0.2646814286708832, "step": 5369 }, { "epoch": 2.9, "learning_rate": 1.8669658151451712e-08, "logits/chosen": -2.0976274013519287, "logits/rejected": -2.1628456115722656, "logps/chosen": -2.5113844871520996, "logps/rejected": -13.745013236999512, "loss": 0.3675, "rewards/accuracies": 1.0, "rewards/chosen": 1.676862120628357, "rewards/margins": 0.8116968870162964, "rewards/rejected": 0.8651652336120605, "step": 5370 }, { "epoch": 2.9, "learning_rate": 1.865264213019819e-08, "logits/chosen": -2.2474751472473145, "logits/rejected": -2.376006841659546, "logps/chosen": -7.0919389724731445, "logps/rejected": -7.367206573486328, "loss": 0.6624, "rewards/accuracies": 1.0, "rewards/chosen": 1.4271682500839233, "rewards/margins": 0.06251823902130127, "rewards/rejected": 1.364650011062622, "step": 5371 }, { "epoch": 2.9, "learning_rate": 1.863563208866069e-08, "logits/chosen": -2.0734636783599854, "logits/rejected": -2.074611186981201, "logps/chosen": -1.9471899271011353, "logps/rejected": -1.8940086364746094, "loss": 0.4496, "rewards/accuracies": 1.0, "rewards/chosen": 1.399323582649231, "rewards/margins": 0.5662417411804199, "rewards/rejected": 0.833081841468811, "step": 5372 }, { "epoch": 2.9, "learning_rate": 1.8618628030083993e-08, "logits/chosen": -2.075343608856201, "logits/rejected": -2.075382709503174, "logps/chosen": -0.666786789894104, "logps/rejected": -5.109833240509033, "loss": 0.4051, "rewards/accuracies": 1.0, "rewards/chosen": 1.0676835775375366, "rewards/margins": 0.6943238377571106, "rewards/rejected": 0.373359739780426, "step": 5373 }, { "epoch": 2.9, "learning_rate": 1.8601629957711736e-08, "logits/chosen": -2.1285955905914307, "logits/rejected": -2.0133705139160156, "logps/chosen": -33.86699676513672, "logps/rejected": -4.371561050415039, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 2.7215416431427, "rewards/margins": 2.094057321548462, "rewards/rejected": 0.6274843215942383, "step": 5374 }, { "epoch": 2.9, "learning_rate": 1.8584637874786406e-08, "logits/chosen": -2.0275020599365234, "logits/rejected": -2.0421273708343506, "logps/chosen": -3.1119136810302734, "logps/rejected": -1.0174437761306763, "loss": 0.5914, "rewards/accuracies": 1.0, "rewards/chosen": 0.9119102358818054, "rewards/margins": 0.21500033140182495, "rewards/rejected": 0.6969099044799805, "step": 5375 }, { "epoch": 2.9, "learning_rate": 1.856765178454938e-08, "logits/chosen": -2.191080093383789, "logits/rejected": -2.350970506668091, "logps/chosen": -0.36681410670280457, "logps/rejected": -0.4244581460952759, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9553432464599609, "rewards/margins": 0.005687534809112549, "rewards/rejected": 0.9496557116508484, "step": 5376 }, { "epoch": 2.9, "learning_rate": 1.8550671690240832e-08, "logits/chosen": -2.1549124717712402, "logits/rejected": -2.156499147415161, "logps/chosen": -0.80513596534729, "logps/rejected": -3.171612501144409, "loss": 0.4958, "rewards/accuracies": 1.0, "rewards/chosen": 1.0407987833023071, "rewards/margins": 0.4435802698135376, "rewards/rejected": 0.5972185134887695, "step": 5377 }, { "epoch": 2.9, "learning_rate": 1.8533697595099847e-08, "logits/chosen": -2.0384695529937744, "logits/rejected": -2.0409631729125977, "logps/chosen": -5.021580219268799, "logps/rejected": -11.905308723449707, "loss": 0.3601, "rewards/accuracies": 1.0, "rewards/chosen": 1.773633360862732, "rewards/margins": 0.8359583616256714, "rewards/rejected": 0.9376749992370605, "step": 5378 }, { "epoch": 2.9, "learning_rate": 1.851672950236434e-08, "logits/chosen": -2.0952916145324707, "logits/rejected": -2.0772223472595215, "logps/chosen": -6.783487319946289, "logps/rejected": -2.080630302429199, "loss": 0.3344, "rewards/accuracies": 1.0, "rewards/chosen": 1.7151631116867065, "rewards/margins": 0.9236077070236206, "rewards/rejected": 0.7915554046630859, "step": 5379 }, { "epoch": 2.9, "learning_rate": 1.8499767415271073e-08, "logits/chosen": -2.1965675354003906, "logits/rejected": -2.3166141510009766, "logps/chosen": -1.2540425062179565, "logps/rejected": -1.350830316543579, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.8048030138015747, "rewards/margins": -0.006142795085906982, "rewards/rejected": 0.8109458088874817, "step": 5380 }, { "epoch": 2.9, "learning_rate": 1.8482811337055708e-08, "logits/chosen": -2.170779228210449, "logits/rejected": -2.1451501846313477, "logps/chosen": -19.24064064025879, "logps/rejected": -16.72468376159668, "loss": 0.2085, "rewards/accuracies": 1.0, "rewards/chosen": 2.1751248836517334, "rewards/margins": 1.4620200395584106, "rewards/rejected": 0.7131048440933228, "step": 5381 }, { "epoch": 2.9, "learning_rate": 1.846586127095272e-08, "logits/chosen": -2.169522762298584, "logits/rejected": -2.3231992721557617, "logps/chosen": -9.962397575378418, "logps/rejected": -10.625598907470703, "loss": 0.641, "rewards/accuracies": 1.0, "rewards/chosen": 1.2237298488616943, "rewards/margins": 0.10716640949249268, "rewards/rejected": 1.1165634393692017, "step": 5382 }, { "epoch": 2.9, "learning_rate": 1.8448917220195438e-08, "logits/chosen": -2.1620726585388184, "logits/rejected": -2.151668071746826, "logps/chosen": -8.062005043029785, "logps/rejected": -3.4392950534820557, "loss": 0.6413, "rewards/accuracies": 1.0, "rewards/chosen": 0.6629757285118103, "rewards/margins": 0.1065678596496582, "rewards/rejected": 0.5564078688621521, "step": 5383 }, { "epoch": 2.9, "learning_rate": 1.843197918801605e-08, "logits/chosen": -2.0514140129089355, "logits/rejected": -2.2459781169891357, "logps/chosen": -2.8340492248535156, "logps/rejected": -3.2318673133850098, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.8235287070274353, "rewards/margins": 0.028067469596862793, "rewards/rejected": 0.7954612374305725, "step": 5384 }, { "epoch": 2.9, "learning_rate": 1.8415047177645604e-08, "logits/chosen": -1.9371275901794434, "logits/rejected": -2.221076726913452, "logps/chosen": -0.3115256130695343, "logps/rejected": -0.28901901841163635, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9561932682991028, "rewards/margins": 0.015137851238250732, "rewards/rejected": 0.941055417060852, "step": 5385 }, { "epoch": 2.91, "learning_rate": 1.8398121192314e-08, "logits/chosen": -2.0318491458892822, "logits/rejected": -2.0336239337921143, "logps/chosen": -1.826852798461914, "logps/rejected": -8.686686515808105, "loss": 0.3453, "rewards/accuracies": 1.0, "rewards/chosen": 0.9879249930381775, "rewards/margins": 0.8856133818626404, "rewards/rejected": 0.10231161117553711, "step": 5386 }, { "epoch": 2.91, "learning_rate": 1.838120123524996e-08, "logits/chosen": -2.0959537029266357, "logits/rejected": -2.091726779937744, "logps/chosen": -3.5251219272613525, "logps/rejected": -4.559375286102295, "loss": 0.4796, "rewards/accuracies": 1.0, "rewards/chosen": 1.0426315069198608, "rewards/margins": 0.48534566164016724, "rewards/rejected": 0.5572858452796936, "step": 5387 }, { "epoch": 2.91, "learning_rate": 1.83642873096811e-08, "logits/chosen": -2.1171951293945312, "logits/rejected": -2.2948837280273438, "logps/chosen": -2.2003161907196045, "logps/rejected": -2.019287109375, "loss": 0.7027, "rewards/accuracies": 0.0, "rewards/chosen": 1.1026597023010254, "rewards/margins": -0.01894402503967285, "rewards/rejected": 1.1216037273406982, "step": 5388 }, { "epoch": 2.91, "learning_rate": 1.8347379418833848e-08, "logits/chosen": -1.9710627794265747, "logits/rejected": -2.284942626953125, "logps/chosen": -0.13021071255207062, "logps/rejected": -0.1481279879808426, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9519834518432617, "rewards/margins": 0.013598799705505371, "rewards/rejected": 0.9383846521377563, "step": 5389 }, { "epoch": 2.91, "learning_rate": 1.8330477565933506e-08, "logits/chosen": -2.1488125324249268, "logits/rejected": -2.1463592052459717, "logps/chosen": -4.685701370239258, "logps/rejected": -7.497609615325928, "loss": 0.4299, "rewards/accuracies": 1.0, "rewards/chosen": 1.0603491067886353, "rewards/margins": 0.6214210987091064, "rewards/rejected": 0.4389279782772064, "step": 5390 }, { "epoch": 2.91, "learning_rate": 1.831358175420421e-08, "logits/chosen": -2.0490212440490723, "logits/rejected": -2.3242745399475098, "logps/chosen": -2.4704136848449707, "logps/rejected": -3.6118855476379395, "loss": 0.7063, "rewards/accuracies": 0.0, "rewards/chosen": 1.0169334411621094, "rewards/margins": -0.026040077209472656, "rewards/rejected": 1.042973518371582, "step": 5391 }, { "epoch": 2.91, "learning_rate": 1.8296691986868944e-08, "logits/chosen": -2.126250982284546, "logits/rejected": -2.166067361831665, "logps/chosen": -4.148955345153809, "logps/rejected": -8.489442825317383, "loss": 0.5477, "rewards/accuracies": 1.0, "rewards/chosen": 1.2693276405334473, "rewards/margins": 0.3155951499938965, "rewards/rejected": 0.9537324905395508, "step": 5392 }, { "epoch": 2.91, "learning_rate": 1.8279808267149553e-08, "logits/chosen": -2.0070576667785645, "logits/rejected": -2.0074710845947266, "logps/chosen": -0.16722284257411957, "logps/rejected": -9.57083797454834, "loss": 0.3189, "rewards/accuracies": 1.0, "rewards/chosen": 1.0653705596923828, "rewards/margins": 0.9793060421943665, "rewards/rejected": 0.08606453239917755, "step": 5393 }, { "epoch": 2.91, "learning_rate": 1.8262930598266713e-08, "logits/chosen": -2.013159990310669, "logits/rejected": -2.2787694931030273, "logps/chosen": -0.6911078095436096, "logps/rejected": -0.6915491223335266, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.8898982405662537, "rewards/margins": 0.013420701026916504, "rewards/rejected": 0.8764775395393372, "step": 5394 }, { "epoch": 2.91, "learning_rate": 1.824605898343995e-08, "logits/chosen": -2.214611053466797, "logits/rejected": -2.1943602561950684, "logps/chosen": -4.622652053833008, "logps/rejected": -9.17238998413086, "loss": 0.4017, "rewards/accuracies": 1.0, "rewards/chosen": 1.0862010717391968, "rewards/margins": 0.7045271396636963, "rewards/rejected": 0.3816739022731781, "step": 5395 }, { "epoch": 2.91, "learning_rate": 1.8229193425887636e-08, "logits/chosen": -2.132624626159668, "logits/rejected": -2.3146321773529053, "logps/chosen": -0.92486572265625, "logps/rejected": -0.9665011167526245, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 1.033359408378601, "rewards/margins": 0.026118874549865723, "rewards/rejected": 1.0072405338287354, "step": 5396 }, { "epoch": 2.91, "learning_rate": 1.8212333928826983e-08, "logits/chosen": -2.162189245223999, "logits/rejected": -2.177109718322754, "logps/chosen": -3.491665840148926, "logps/rejected": -2.3102188110351562, "loss": 0.6345, "rewards/accuracies": 1.0, "rewards/chosen": 1.2735812664031982, "rewards/margins": 0.12096405029296875, "rewards/rejected": 1.1526172161102295, "step": 5397 }, { "epoch": 2.91, "learning_rate": 1.8195480495474063e-08, "logits/chosen": -2.0521762371063232, "logits/rejected": -2.0448265075683594, "logps/chosen": -2.384857177734375, "logps/rejected": -7.811329364776611, "loss": 0.4047, "rewards/accuracies": 1.0, "rewards/chosen": 0.9064704775810242, "rewards/margins": 0.6955366134643555, "rewards/rejected": 0.2109338343143463, "step": 5398 }, { "epoch": 2.91, "learning_rate": 1.8178633129043774e-08, "logits/chosen": -2.1623430252075195, "logits/rejected": -2.0931389331817627, "logps/chosen": -26.115697860717773, "logps/rejected": -7.908947467803955, "loss": 0.1748, "rewards/accuracies": 1.0, "rewards/chosen": 2.5632009506225586, "rewards/margins": 1.6552824974060059, "rewards/rejected": 0.907918393611908, "step": 5399 }, { "epoch": 2.91, "learning_rate": 1.816179183274986e-08, "logits/chosen": -2.0117034912109375, "logits/rejected": -2.02742600440979, "logps/chosen": -3.798628807067871, "logps/rejected": -6.552216529846191, "loss": 0.5709, "rewards/accuracies": 1.0, "rewards/chosen": 0.9269576072692871, "rewards/margins": 0.2616569399833679, "rewards/rejected": 0.6653006672859192, "step": 5400 }, { "epoch": 2.91, "learning_rate": 1.8144956609804917e-08, "logits/chosen": -2.058711051940918, "logits/rejected": -2.282078981399536, "logps/chosen": -0.1475793421268463, "logps/rejected": -0.14885284006595612, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.9550237059593201, "rewards/margins": 0.004839181900024414, "rewards/rejected": 0.9501845240592957, "step": 5401 }, { "epoch": 2.91, "learning_rate": 1.8128127463420357e-08, "logits/chosen": -2.086679697036743, "logits/rejected": -2.133329153060913, "logps/chosen": -4.059080123901367, "logps/rejected": -25.07036590576172, "loss": 0.1911, "rewards/accuracies": 1.0, "rewards/chosen": 1.2931703329086304, "rewards/margins": 1.5577224493026733, "rewards/rejected": -0.26455211639404297, "step": 5402 }, { "epoch": 2.91, "learning_rate": 1.8111304396806478e-08, "logits/chosen": -2.047667980194092, "logits/rejected": -2.055694580078125, "logps/chosen": -4.705686092376709, "logps/rejected": -4.066931247711182, "loss": 0.3608, "rewards/accuracies": 1.0, "rewards/chosen": 1.4243799448013306, "rewards/margins": 0.8337486982345581, "rewards/rejected": 0.5906312465667725, "step": 5403 }, { "epoch": 2.91, "learning_rate": 1.8094487413172378e-08, "logits/chosen": -2.1412570476531982, "logits/rejected": -2.1362528800964355, "logps/chosen": -1.7849297523498535, "logps/rejected": -3.7069926261901855, "loss": 0.7733, "rewards/accuracies": 0.0, "rewards/chosen": 0.8356891870498657, "rewards/margins": -0.15427875518798828, "rewards/rejected": 0.989967942237854, "step": 5404 }, { "epoch": 2.92, "learning_rate": 1.807767651572601e-08, "logits/chosen": -2.031341552734375, "logits/rejected": -2.323065996170044, "logps/chosen": -0.6241411566734314, "logps/rejected": -0.7537766695022583, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.8074411749839783, "rewards/margins": 0.032295942306518555, "rewards/rejected": 0.7751452326774597, "step": 5405 }, { "epoch": 2.92, "learning_rate": 1.8060871707674174e-08, "logits/chosen": -2.0177855491638184, "logits/rejected": -2.0111982822418213, "logps/chosen": -2.7636778354644775, "logps/rejected": -6.866523742675781, "loss": 0.3916, "rewards/accuracies": 1.0, "rewards/chosen": 0.9236626029014587, "rewards/margins": 0.7352589964866638, "rewards/rejected": 0.18840360641479492, "step": 5406 }, { "epoch": 2.92, "learning_rate": 1.8044072992222477e-08, "logits/chosen": -2.0371012687683105, "logits/rejected": -2.3110756874084473, "logps/chosen": -0.19045591354370117, "logps/rejected": -0.2168840616941452, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 1.0505651235580444, "rewards/margins": 0.018070220947265625, "rewards/rejected": 1.0324949026107788, "step": 5407 }, { "epoch": 2.92, "learning_rate": 1.8027280372575386e-08, "logits/chosen": -2.065608263015747, "logits/rejected": -2.0595529079437256, "logps/chosen": -0.5703201293945312, "logps/rejected": -6.294083118438721, "loss": 0.3072, "rewards/accuracies": 1.0, "rewards/chosen": 1.363331913948059, "rewards/margins": 1.0227453708648682, "rewards/rejected": 0.34058651328086853, "step": 5408 }, { "epoch": 2.92, "learning_rate": 1.801049385193622e-08, "logits/chosen": -2.0351202487945557, "logits/rejected": -2.279087781906128, "logps/chosen": -0.4774461090564728, "logps/rejected": -0.4649086594581604, "loss": 0.6673, "rewards/accuracies": 1.0, "rewards/chosen": 0.955705463886261, "rewards/margins": 0.05229663848876953, "rewards/rejected": 0.9034088253974915, "step": 5409 }, { "epoch": 2.92, "learning_rate": 1.799371343350711e-08, "logits/chosen": -2.2081692218780518, "logits/rejected": -2.3275022506713867, "logps/chosen": -3.8273746967315674, "logps/rejected": -1.2748533487319946, "loss": 0.774, "rewards/accuracies": 0.0, "rewards/chosen": 0.7123458981513977, "rewards/margins": -0.15568768978118896, "rewards/rejected": 0.8680335879325867, "step": 5410 }, { "epoch": 2.92, "learning_rate": 1.7976939120489037e-08, "logits/chosen": -2.1473548412323, "logits/rejected": -2.1521291732788086, "logps/chosen": -2.1780428886413574, "logps/rejected": -15.08390998840332, "loss": 0.2343, "rewards/accuracies": 1.0, "rewards/chosen": 1.5658222436904907, "rewards/margins": 1.3315550088882446, "rewards/rejected": 0.2342672348022461, "step": 5411 }, { "epoch": 2.92, "learning_rate": 1.7960170916081807e-08, "logits/chosen": -2.022611141204834, "logits/rejected": -2.2903246879577637, "logps/chosen": -3.453751802444458, "logps/rejected": -2.9815077781677246, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.5819725394248962, "rewards/margins": -0.002529919147491455, "rewards/rejected": 0.5845024585723877, "step": 5412 }, { "epoch": 2.92, "learning_rate": 1.7943408823484053e-08, "logits/chosen": -2.0030174255371094, "logits/rejected": -2.010864496231079, "logps/chosen": -2.400820732116699, "logps/rejected": -3.4470431804656982, "loss": 0.3946, "rewards/accuracies": 1.0, "rewards/chosen": 1.295121669769287, "rewards/margins": 0.7262020707130432, "rewards/rejected": 0.5689195990562439, "step": 5413 }, { "epoch": 2.92, "learning_rate": 1.7926652845893285e-08, "logits/chosen": -2.082885503768921, "logits/rejected": -1.9755431413650513, "logps/chosen": -22.499614715576172, "logps/rejected": -9.083577156066895, "loss": 0.1127, "rewards/accuracies": 1.0, "rewards/chosen": 2.3377997875213623, "rewards/margins": 2.1262550354003906, "rewards/rejected": 0.2115447074174881, "step": 5414 }, { "epoch": 2.92, "learning_rate": 1.7909902986505804e-08, "logits/chosen": -2.1131253242492676, "logits/rejected": -2.119220733642578, "logps/chosen": -1.9030711650848389, "logps/rejected": -4.031601428985596, "loss": 0.4538, "rewards/accuracies": 1.0, "rewards/chosen": 1.1485049724578857, "rewards/margins": 0.5546515583992004, "rewards/rejected": 0.5938534140586853, "step": 5415 }, { "epoch": 2.92, "learning_rate": 1.7893159248516748e-08, "logits/chosen": -1.9496797323226929, "logits/rejected": -1.94883394241333, "logps/chosen": -0.34875330328941345, "logps/rejected": -2.4085865020751953, "loss": 0.6317, "rewards/accuracies": 1.0, "rewards/chosen": 0.8937360644340515, "rewards/margins": 0.12689578533172607, "rewards/rejected": 0.7668402791023254, "step": 5416 }, { "epoch": 2.92, "learning_rate": 1.787642163512011e-08, "logits/chosen": -2.249363422393799, "logits/rejected": -2.134171485900879, "logps/chosen": -25.119647979736328, "logps/rejected": -5.605460166931152, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 2.6560630798339844, "rewards/margins": 2.281039237976074, "rewards/rejected": 0.3750239312648773, "step": 5417 }, { "epoch": 2.92, "learning_rate": 1.7859690149508693e-08, "logits/chosen": -2.04002046585083, "logits/rejected": -2.0426878929138184, "logps/chosen": -0.2141350358724594, "logps/rejected": -3.8235294818878174, "loss": 0.4998, "rewards/accuracies": 1.0, "rewards/chosen": 1.043664813041687, "rewards/margins": 0.43321841955184937, "rewards/rejected": 0.6104463934898376, "step": 5418 }, { "epoch": 2.92, "learning_rate": 1.7842964794874132e-08, "logits/chosen": -2.061816930770874, "logits/rejected": -2.3737242221832275, "logps/chosen": -0.3134375810623169, "logps/rejected": -0.3377777934074402, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.8502232432365417, "rewards/margins": 0.014485597610473633, "rewards/rejected": 0.8357376456260681, "step": 5419 }, { "epoch": 2.92, "learning_rate": 1.7826245574406912e-08, "logits/chosen": -2.112913131713867, "logits/rejected": -2.3139312267303467, "logps/chosen": -1.880052089691162, "logps/rejected": -0.6859195828437805, "loss": 0.6675, "rewards/accuracies": 1.0, "rewards/chosen": 0.9095478057861328, "rewards/margins": 0.05194348096847534, "rewards/rejected": 0.8576043248176575, "step": 5420 }, { "epoch": 2.92, "learning_rate": 1.780953249129633e-08, "logits/chosen": -2.1265268325805664, "logits/rejected": -2.1190316677093506, "logps/chosen": -1.5172115564346313, "logps/rejected": -4.141753673553467, "loss": 0.47, "rewards/accuracies": 1.0, "rewards/chosen": 1.3561071157455444, "rewards/margins": 0.5108354091644287, "rewards/rejected": 0.8452717065811157, "step": 5421 }, { "epoch": 2.92, "learning_rate": 1.7792825548730512e-08, "logits/chosen": -2.0481982231140137, "logits/rejected": -2.292954444885254, "logps/chosen": -0.9266796112060547, "logps/rejected": -1.0243622064590454, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.7893818020820618, "rewards/margins": 0.03426969051361084, "rewards/rejected": 0.7551121115684509, "step": 5422 }, { "epoch": 2.93, "learning_rate": 1.7776124749896436e-08, "logits/chosen": -2.173109531402588, "logits/rejected": -2.1785449981689453, "logps/chosen": -4.036367416381836, "logps/rejected": -5.641282558441162, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 1.0722559690475464, "rewards/margins": 0.6929939389228821, "rewards/rejected": 0.3792620301246643, "step": 5423 }, { "epoch": 2.93, "learning_rate": 1.7759430097979866e-08, "logits/chosen": -1.9624760150909424, "logits/rejected": -2.2510998249053955, "logps/chosen": -0.867906928062439, "logps/rejected": -0.728439450263977, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 1.011905550956726, "rewards/margins": 0.03380495309829712, "rewards/rejected": 0.978100597858429, "step": 5424 }, { "epoch": 2.93, "learning_rate": 1.7742741596165405e-08, "logits/chosen": -2.2105071544647217, "logits/rejected": -2.2035810947418213, "logps/chosen": -5.854797840118408, "logps/rejected": -8.135891914367676, "loss": 0.2424, "rewards/accuracies": 1.0, "rewards/chosen": 1.4079288244247437, "rewards/margins": 1.293350338935852, "rewards/rejected": 0.11457844078540802, "step": 5425 }, { "epoch": 2.93, "learning_rate": 1.772605924763654e-08, "logits/chosen": -2.0507309436798096, "logits/rejected": -2.3010942935943604, "logps/chosen": -0.17460516095161438, "logps/rejected": -0.20614543557167053, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 1.0441457033157349, "rewards/margins": -0.0019479990005493164, "rewards/rejected": 1.0460937023162842, "step": 5426 }, { "epoch": 2.93, "learning_rate": 1.7709383055575512e-08, "logits/chosen": -2.0959413051605225, "logits/rejected": -2.332012414932251, "logps/chosen": -1.609156608581543, "logps/rejected": -1.697502851486206, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": 0.8562578558921814, "rewards/margins": 0.03629302978515625, "rewards/rejected": 0.8199648261070251, "step": 5427 }, { "epoch": 2.93, "learning_rate": 1.769271302316342e-08, "logits/chosen": -2.0594074726104736, "logits/rejected": -2.0538997650146484, "logps/chosen": -3.0291459560394287, "logps/rejected": -4.938234329223633, "loss": 0.3492, "rewards/accuracies": 1.0, "rewards/chosen": 1.4175846576690674, "rewards/margins": 0.8724911212921143, "rewards/rejected": 0.5450935363769531, "step": 5428 }, { "epoch": 2.93, "learning_rate": 1.7676049153580187e-08, "logits/chosen": -2.144023895263672, "logits/rejected": -2.1399307250976562, "logps/chosen": -4.216541290283203, "logps/rejected": -2.9389259815216064, "loss": 0.2841, "rewards/accuracies": 1.0, "rewards/chosen": 1.7163225412368774, "rewards/margins": 1.1130982637405396, "rewards/rejected": 0.6032242774963379, "step": 5429 }, { "epoch": 2.93, "learning_rate": 1.7659391450004546e-08, "logits/chosen": -2.176300287246704, "logits/rejected": -2.3039629459381104, "logps/chosen": -18.509395599365234, "logps/rejected": -1.6800408363342285, "loss": 0.6512, "rewards/accuracies": 1.0, "rewards/chosen": 1.1283234357833862, "rewards/margins": 0.08565163612365723, "rewards/rejected": 1.042671799659729, "step": 5430 }, { "epoch": 2.93, "learning_rate": 1.7642739915614076e-08, "logits/chosen": -2.0451173782348633, "logits/rejected": -2.033264398574829, "logps/chosen": -5.16418981552124, "logps/rejected": -8.193187713623047, "loss": 0.2892, "rewards/accuracies": 1.0, "rewards/chosen": 1.622054100036621, "rewards/margins": 1.092413306236267, "rewards/rejected": 0.529640793800354, "step": 5431 }, { "epoch": 2.93, "learning_rate": 1.7626094553585158e-08, "logits/chosen": -2.0924086570739746, "logits/rejected": -2.3168387413024902, "logps/chosen": -0.3378276526927948, "logps/rejected": -0.4010401666164398, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.9245262145996094, "rewards/margins": 0.038633525371551514, "rewards/rejected": 0.8858926892280579, "step": 5432 }, { "epoch": 2.93, "learning_rate": 1.7609455367093007e-08, "logits/chosen": -2.0815768241882324, "logits/rejected": -2.088461399078369, "logps/chosen": -1.4326419830322266, "logps/rejected": -3.1146562099456787, "loss": 0.3863, "rewards/accuracies": 1.0, "rewards/chosen": 1.454048991203308, "rewards/margins": 0.7517991662025452, "rewards/rejected": 0.7022498250007629, "step": 5433 }, { "epoch": 2.93, "learning_rate": 1.7592822359311664e-08, "logits/chosen": -1.9929813146591187, "logits/rejected": -1.9929052591323853, "logps/chosen": -1.0195034742355347, "logps/rejected": -1.0605652332305908, "loss": 0.5443, "rewards/accuracies": 1.0, "rewards/chosen": 1.1930978298187256, "rewards/margins": 0.3237094283103943, "rewards/rejected": 0.8693884015083313, "step": 5434 }, { "epoch": 2.93, "learning_rate": 1.757619553341398e-08, "logits/chosen": -2.0681068897247314, "logits/rejected": -2.3150272369384766, "logps/chosen": -1.5284643173217773, "logps/rejected": -5.136446952819824, "loss": 0.6272, "rewards/accuracies": 1.0, "rewards/chosen": 1.0462913513183594, "rewards/margins": 0.13664579391479492, "rewards/rejected": 0.9096455574035645, "step": 5435 }, { "epoch": 2.93, "learning_rate": 1.7559574892571632e-08, "logits/chosen": -1.9520113468170166, "logits/rejected": -1.9513888359069824, "logps/chosen": -0.8378735780715942, "logps/rejected": -1.153509259223938, "loss": 0.6557, "rewards/accuracies": 1.0, "rewards/chosen": 0.8565242886543274, "rewards/margins": 0.07642173767089844, "rewards/rejected": 0.780102550983429, "step": 5436 }, { "epoch": 2.93, "learning_rate": 1.7542960439955124e-08, "logits/chosen": -2.012068271636963, "logits/rejected": -2.284228801727295, "logps/chosen": -0.29104143381118774, "logps/rejected": -0.3332696259021759, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.8609600067138672, "rewards/margins": 0.0022292733192443848, "rewards/rejected": 0.8587307333946228, "step": 5437 }, { "epoch": 2.93, "learning_rate": 1.752635217873376e-08, "logits/chosen": -2.1201038360595703, "logits/rejected": -2.311138391494751, "logps/chosen": -1.235986351966858, "logps/rejected": -1.276052474975586, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.6032595038414001, "rewards/margins": 0.004262566566467285, "rewards/rejected": 0.5989969372749329, "step": 5438 }, { "epoch": 2.93, "learning_rate": 1.750975011207569e-08, "logits/chosen": -2.131157875061035, "logits/rejected": -2.1295244693756104, "logps/chosen": -0.4833362400531769, "logps/rejected": -2.633164882659912, "loss": 0.5932, "rewards/accuracies": 1.0, "rewards/chosen": 0.9964948892593384, "rewards/margins": 0.21089577674865723, "rewards/rejected": 0.7855991125106812, "step": 5439 }, { "epoch": 2.93, "learning_rate": 1.7493154243147858e-08, "logits/chosen": -2.042428970336914, "logits/rejected": -2.079232931137085, "logps/chosen": -0.9847121834754944, "logps/rejected": -10.978239059448242, "loss": 0.26, "rewards/accuracies": 1.0, "rewards/chosen": 1.5743029117584229, "rewards/margins": 1.2144733667373657, "rewards/rejected": 0.35982951521873474, "step": 5440 }, { "epoch": 2.93, "learning_rate": 1.7476564575116037e-08, "logits/chosen": -2.198052406311035, "logits/rejected": -2.200676679611206, "logps/chosen": -1.3312493562698364, "logps/rejected": -1.0226457118988037, "loss": 0.6038, "rewards/accuracies": 1.0, "rewards/chosen": 1.2386997938156128, "rewards/margins": 0.18743598461151123, "rewards/rejected": 1.0512638092041016, "step": 5441 }, { "epoch": 2.94, "learning_rate": 1.7459981111144824e-08, "logits/chosen": -2.091477394104004, "logits/rejected": -2.243727922439575, "logps/chosen": -6.481412887573242, "logps/rejected": -4.568808555603027, "loss": 0.5918, "rewards/accuracies": 1.0, "rewards/chosen": 1.1095103025436401, "rewards/margins": 0.21403807401657104, "rewards/rejected": 0.8954722285270691, "step": 5442 }, { "epoch": 2.94, "learning_rate": 1.7443403854397616e-08, "logits/chosen": -2.0004794597625732, "logits/rejected": -1.9987800121307373, "logps/chosen": -1.143670916557312, "logps/rejected": -6.425379276275635, "loss": 0.3781, "rewards/accuracies": 1.0, "rewards/chosen": 1.016832709312439, "rewards/margins": 0.7775530219078064, "rewards/rejected": 0.23927970230579376, "step": 5443 }, { "epoch": 2.94, "learning_rate": 1.7426832808036633e-08, "logits/chosen": -2.05774188041687, "logits/rejected": -2.2875428199768066, "logps/chosen": -0.28681260347366333, "logps/rejected": -0.27636533975601196, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": 0.8660603761672974, "rewards/margins": -0.008192121982574463, "rewards/rejected": 0.8742524981498718, "step": 5444 }, { "epoch": 2.94, "learning_rate": 1.741026797522292e-08, "logits/chosen": -2.1050074100494385, "logits/rejected": -2.108644723892212, "logps/chosen": -3.1080706119537354, "logps/rejected": -0.8873323798179626, "loss": 0.6017, "rewards/accuracies": 1.0, "rewards/chosen": 1.0323598384857178, "rewards/margins": 0.19211989641189575, "rewards/rejected": 0.840239942073822, "step": 5445 }, { "epoch": 2.94, "learning_rate": 1.739370935911632e-08, "logits/chosen": -2.13482666015625, "logits/rejected": -2.29923152923584, "logps/chosen": -0.15367266535758972, "logps/rejected": -0.17766150832176208, "loss": 0.6996, "rewards/accuracies": 0.0, "rewards/chosen": 0.9293123483657837, "rewards/margins": -0.01281362771987915, "rewards/rejected": 0.9421259760856628, "step": 5446 }, { "epoch": 2.94, "learning_rate": 1.7377156962875493e-08, "logits/chosen": -2.0503883361816406, "logits/rejected": -2.0385513305664062, "logps/chosen": -5.041879177093506, "logps/rejected": -1.1284525394439697, "loss": 0.3337, "rewards/accuracies": 1.0, "rewards/chosen": 1.7166742086410522, "rewards/margins": 0.9261365532875061, "rewards/rejected": 0.7905376553535461, "step": 5447 }, { "epoch": 2.94, "learning_rate": 1.7360610789657944e-08, "logits/chosen": -2.014224052429199, "logits/rejected": -2.014889717102051, "logps/chosen": -0.15007884800434113, "logps/rejected": -6.750406265258789, "loss": 0.4413, "rewards/accuracies": 1.0, "rewards/chosen": 0.8996354341506958, "rewards/margins": 0.5891494750976562, "rewards/rejected": 0.31048592925071716, "step": 5448 }, { "epoch": 2.94, "learning_rate": 1.7344070842619944e-08, "logits/chosen": -2.115396022796631, "logits/rejected": -2.1080617904663086, "logps/chosen": -0.5663115382194519, "logps/rejected": -4.561976432800293, "loss": 0.5438, "rewards/accuracies": 1.0, "rewards/chosen": 0.9465241432189941, "rewards/margins": 0.32488298416137695, "rewards/rejected": 0.6216411590576172, "step": 5449 }, { "epoch": 2.94, "learning_rate": 1.7327537124916607e-08, "logits/chosen": -2.1408445835113525, "logits/rejected": -2.1229913234710693, "logps/chosen": -1.2384496927261353, "logps/rejected": -8.9524507522583, "loss": 0.4215, "rewards/accuracies": 1.0, "rewards/chosen": 1.2457106113433838, "rewards/margins": 0.6456683278083801, "rewards/rejected": 0.6000422835350037, "step": 5450 }, { "epoch": 2.94, "learning_rate": 1.7311009639701845e-08, "logits/chosen": -2.0665555000305176, "logits/rejected": -2.0377023220062256, "logps/chosen": -12.767581939697266, "logps/rejected": -2.7469615936279297, "loss": 0.2521, "rewards/accuracies": 1.0, "rewards/chosen": 1.7438219785690308, "rewards/margins": 1.2491867542266846, "rewards/rejected": 0.4946351945400238, "step": 5451 }, { "epoch": 2.94, "learning_rate": 1.729448839012838e-08, "logits/chosen": -2.1980531215667725, "logits/rejected": -2.274366855621338, "logps/chosen": -0.20558016002178192, "logps/rejected": -0.226771280169487, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8881670236587524, "rewards/margins": 0.0021303296089172363, "rewards/rejected": 0.8860366940498352, "step": 5452 }, { "epoch": 2.94, "learning_rate": 1.7277973379347777e-08, "logits/chosen": -2.0412161350250244, "logits/rejected": -2.0323753356933594, "logps/chosen": -12.517046928405762, "logps/rejected": -5.814582824707031, "loss": 0.3109, "rewards/accuracies": 1.0, "rewards/chosen": 1.7595303058624268, "rewards/margins": 1.0089929103851318, "rewards/rejected": 0.7505373954772949, "step": 5453 }, { "epoch": 2.94, "learning_rate": 1.7261464610510346e-08, "logits/chosen": -2.040497303009033, "logits/rejected": -2.040076732635498, "logps/chosen": -0.9356251955032349, "logps/rejected": -4.458934783935547, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 0.9651663899421692, "rewards/margins": 0.5633840560913086, "rewards/rejected": 0.4017823338508606, "step": 5454 }, { "epoch": 2.94, "learning_rate": 1.7244962086765252e-08, "logits/chosen": -2.083660364151001, "logits/rejected": -2.294424533843994, "logps/chosen": -0.18548721075057983, "logps/rejected": -0.1815381646156311, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.8799285888671875, "rewards/margins": 0.023432791233062744, "rewards/rejected": 0.8564957976341248, "step": 5455 }, { "epoch": 2.94, "learning_rate": 1.7228465811260473e-08, "logits/chosen": -2.0760343074798584, "logits/rejected": -2.229255437850952, "logps/chosen": -0.28446483612060547, "logps/rejected": -0.2667825222015381, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": 0.8369361162185669, "rewards/margins": -0.009367167949676514, "rewards/rejected": 0.8463032841682434, "step": 5456 }, { "epoch": 2.94, "learning_rate": 1.7211975787142775e-08, "logits/chosen": -2.17162823677063, "logits/rejected": -2.3323750495910645, "logps/chosen": -0.1677963137626648, "logps/rejected": -0.1936827450990677, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9523094296455383, "rewards/margins": 0.013501286506652832, "rewards/rejected": 0.9388081431388855, "step": 5457 }, { "epoch": 2.94, "learning_rate": 1.7195492017557727e-08, "logits/chosen": -2.0394694805145264, "logits/rejected": -2.3476247787475586, "logps/chosen": -8.387266159057617, "logps/rejected": -8.73059368133545, "loss": 0.6749, "rewards/accuracies": 1.0, "rewards/chosen": 0.5057486891746521, "rewards/margins": 0.036734044551849365, "rewards/rejected": 0.46901464462280273, "step": 5458 }, { "epoch": 2.94, "learning_rate": 1.7179014505649754e-08, "logits/chosen": -2.078016519546509, "logits/rejected": -2.294036388397217, "logps/chosen": -7.405279159545898, "logps/rejected": -7.491024971008301, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.6518598794937134, "rewards/margins": 0.019899487495422363, "rewards/rejected": 0.631960391998291, "step": 5459 }, { "epoch": 2.94, "learning_rate": 1.7162543254562017e-08, "logits/chosen": -2.056865692138672, "logits/rejected": -2.0520172119140625, "logps/chosen": -2.1392247676849365, "logps/rejected": -7.1808180809021, "loss": 0.4185, "rewards/accuracies": 1.0, "rewards/chosen": 1.2849719524383545, "rewards/margins": 0.6545467376708984, "rewards/rejected": 0.630425214767456, "step": 5460 }, { "epoch": 2.95, "learning_rate": 1.7146078267436537e-08, "logits/chosen": -2.1079282760620117, "logits/rejected": -2.2495598793029785, "logps/chosen": -3.6458654403686523, "logps/rejected": -3.763437509536743, "loss": 0.6672, "rewards/accuracies": 1.0, "rewards/chosen": 0.5880733728408813, "rewards/margins": 0.052555859088897705, "rewards/rejected": 0.5355175137519836, "step": 5461 }, { "epoch": 2.95, "learning_rate": 1.7129619547414103e-08, "logits/chosen": -2.0985207557678223, "logits/rejected": -2.1085946559906006, "logps/chosen": -0.8442931175231934, "logps/rejected": -12.501784324645996, "loss": 0.4581, "rewards/accuracies": 1.0, "rewards/chosen": 0.9576488733291626, "rewards/margins": 0.5427556037902832, "rewards/rejected": 0.414893239736557, "step": 5462 }, { "epoch": 2.95, "learning_rate": 1.711316709763434e-08, "logits/chosen": -1.993179202079773, "logits/rejected": -1.993066430091858, "logps/chosen": -0.5137110352516174, "logps/rejected": -2.5877439975738525, "loss": 0.5968, "rewards/accuracies": 1.0, "rewards/chosen": 1.0034053325653076, "rewards/margins": 0.2030678391456604, "rewards/rejected": 0.8003374934196472, "step": 5463 }, { "epoch": 2.95, "learning_rate": 1.7096720921235648e-08, "logits/chosen": -2.1435112953186035, "logits/rejected": -2.144869327545166, "logps/chosen": -0.5982319116592407, "logps/rejected": -8.832700729370117, "loss": 0.3833, "rewards/accuracies": 1.0, "rewards/chosen": 0.9512181282043457, "rewards/margins": 0.7610517740249634, "rewards/rejected": 0.1901663839817047, "step": 5464 }, { "epoch": 2.95, "learning_rate": 1.7080281021355254e-08, "logits/chosen": -2.071253538131714, "logits/rejected": -2.3191757202148438, "logps/chosen": -1.7787922620773315, "logps/rejected": -1.1107478141784668, "loss": 0.725, "rewards/accuracies": 0.0, "rewards/chosen": 0.6713709831237793, "rewards/margins": -0.0628131628036499, "rewards/rejected": 0.7341841459274292, "step": 5465 }, { "epoch": 2.95, "learning_rate": 1.7063847401129165e-08, "logits/chosen": -2.0786054134368896, "logits/rejected": -2.0864431858062744, "logps/chosen": -0.350326269865036, "logps/rejected": -8.628954887390137, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 1.1874607801437378, "rewards/margins": 0.8644829392433167, "rewards/rejected": 0.32297784090042114, "step": 5466 }, { "epoch": 2.95, "learning_rate": 1.704742006369222e-08, "logits/chosen": -2.025585889816284, "logits/rejected": -2.3068952560424805, "logps/chosen": -0.5545405745506287, "logps/rejected": -0.6607466340065002, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 0.8640134930610657, "rewards/margins": 0.03586149215698242, "rewards/rejected": 0.8281520009040833, "step": 5467 }, { "epoch": 2.95, "learning_rate": 1.7030999012178026e-08, "logits/chosen": -2.1709609031677246, "logits/rejected": -2.32025408744812, "logps/chosen": -0.653700590133667, "logps/rejected": -0.657974123954773, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9236102104187012, "rewards/margins": 0.017110168933868408, "rewards/rejected": 0.9065000414848328, "step": 5468 }, { "epoch": 2.95, "learning_rate": 1.701458424971902e-08, "logits/chosen": -2.044224262237549, "logits/rejected": -2.2959206104278564, "logps/chosen": -0.21867062151432037, "logps/rejected": -0.2240142673254013, "loss": 0.6673, "rewards/accuracies": 1.0, "rewards/chosen": 1.0057085752487183, "rewards/margins": 0.0524788498878479, "rewards/rejected": 0.9532297253608704, "step": 5469 }, { "epoch": 2.95, "learning_rate": 1.6998175779446422e-08, "logits/chosen": -1.9813488721847534, "logits/rejected": -2.267190933227539, "logps/chosen": -2.171010732650757, "logps/rejected": -2.2821884155273438, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.7150923013687134, "rewards/margins": 0.036709606647491455, "rewards/rejected": 0.6783826947212219, "step": 5470 }, { "epoch": 2.95, "learning_rate": 1.698177360449025e-08, "logits/chosen": -2.072598695755005, "logits/rejected": -2.287381649017334, "logps/chosen": -0.5993778705596924, "logps/rejected": -0.6894232630729675, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 1.0581930875778198, "rewards/margins": 0.0021799802780151367, "rewards/rejected": 1.0560131072998047, "step": 5471 }, { "epoch": 2.95, "learning_rate": 1.696537772797934e-08, "logits/chosen": -2.1072542667388916, "logits/rejected": -2.3257455825805664, "logps/chosen": -0.3737872242927551, "logps/rejected": -0.35934555530548096, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0220136642456055, "rewards/margins": 0.01348876953125, "rewards/rejected": 1.0085248947143555, "step": 5472 }, { "epoch": 2.95, "learning_rate": 1.6948988153041305e-08, "logits/chosen": -2.1283392906188965, "logits/rejected": -2.1300389766693115, "logps/chosen": -2.0227293968200684, "logps/rejected": -7.2397918701171875, "loss": 0.3819, "rewards/accuracies": 1.0, "rewards/chosen": 1.4199857711791992, "rewards/margins": 0.7656097412109375, "rewards/rejected": 0.6543760299682617, "step": 5473 }, { "epoch": 2.95, "learning_rate": 1.6932604882802566e-08, "logits/chosen": -2.095419406890869, "logits/rejected": -2.098421573638916, "logps/chosen": -0.3116178810596466, "logps/rejected": -4.851067543029785, "loss": 0.4339, "rewards/accuracies": 1.0, "rewards/chosen": 1.0881621837615967, "rewards/margins": 0.6101292371749878, "rewards/rejected": 0.4780329763889313, "step": 5474 }, { "epoch": 2.95, "learning_rate": 1.6916227920388343e-08, "logits/chosen": -2.1801869869232178, "logits/rejected": -2.293180465698242, "logps/chosen": -1.3773552179336548, "logps/rejected": -1.0779986381530762, "loss": 0.7496, "rewards/accuracies": 0.0, "rewards/chosen": 0.9415844082832336, "rewards/margins": -0.10981088876724243, "rewards/rejected": 1.051395297050476, "step": 5475 }, { "epoch": 2.95, "learning_rate": 1.6899857268922647e-08, "logits/chosen": -2.1725003719329834, "logits/rejected": -2.3647494316101074, "logps/chosen": -13.298585891723633, "logps/rejected": -9.078694343566895, "loss": 0.4878, "rewards/accuracies": 1.0, "rewards/chosen": 1.3006943464279175, "rewards/margins": 0.4641841650009155, "rewards/rejected": 0.836510181427002, "step": 5476 }, { "epoch": 2.95, "learning_rate": 1.6883492931528286e-08, "logits/chosen": -1.9889487028121948, "logits/rejected": -1.9891451597213745, "logps/chosen": -0.9855804443359375, "logps/rejected": -1.1428661346435547, "loss": 0.6431, "rewards/accuracies": 1.0, "rewards/chosen": 0.9377447962760925, "rewards/margins": 0.10277318954467773, "rewards/rejected": 0.8349716067314148, "step": 5477 }, { "epoch": 2.95, "learning_rate": 1.6867134911326875e-08, "logits/chosen": -2.042617082595825, "logits/rejected": -2.26574444770813, "logps/chosen": -5.378283500671387, "logps/rejected": -5.39925479888916, "loss": 0.6706, "rewards/accuracies": 1.0, "rewards/chosen": 0.36153146624565125, "rewards/margins": 0.04557591676712036, "rewards/rejected": 0.3159555494785309, "step": 5478 }, { "epoch": 2.96, "learning_rate": 1.6850783211438803e-08, "logits/chosen": -2.2178030014038086, "logits/rejected": -2.3514931201934814, "logps/chosen": -0.8644164204597473, "logps/rejected": -1.6861648559570312, "loss": 0.6711, "rewards/accuracies": 1.0, "rewards/chosen": 1.0343431234359741, "rewards/margins": 0.04451882839202881, "rewards/rejected": 0.9898242950439453, "step": 5479 }, { "epoch": 2.96, "learning_rate": 1.6834437834983272e-08, "logits/chosen": -1.969162106513977, "logits/rejected": -1.9817742109298706, "logps/chosen": -5.668244361877441, "logps/rejected": -4.554354667663574, "loss": 0.3509, "rewards/accuracies": 1.0, "rewards/chosen": 1.537209391593933, "rewards/margins": 0.8665639758110046, "rewards/rejected": 0.6706454157829285, "step": 5480 }, { "epoch": 2.96, "learning_rate": 1.6818098785078255e-08, "logits/chosen": -2.1531903743743896, "logits/rejected": -2.2894184589385986, "logps/chosen": -1.233090877532959, "logps/rejected": -1.2764261960983276, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 1.0182983875274658, "rewards/margins": 0.0067861080169677734, "rewards/rejected": 1.011512279510498, "step": 5481 }, { "epoch": 2.96, "learning_rate": 1.6801766064840583e-08, "logits/chosen": -1.950933575630188, "logits/rejected": -2.2697057723999023, "logps/chosen": -0.5316811800003052, "logps/rejected": -0.6613222360610962, "loss": 0.6594, "rewards/accuracies": 1.0, "rewards/chosen": 1.0230252742767334, "rewards/margins": 0.06870424747467041, "rewards/rejected": 0.954321026802063, "step": 5482 }, { "epoch": 2.96, "learning_rate": 1.6785439677385776e-08, "logits/chosen": -2.05220365524292, "logits/rejected": -2.194458246231079, "logps/chosen": -0.17866474390029907, "logps/rejected": -0.23057496547698975, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.7844682335853577, "rewards/margins": 0.004705607891082764, "rewards/rejected": 0.7797626256942749, "step": 5483 }, { "epoch": 2.96, "learning_rate": 1.6769119625828226e-08, "logits/chosen": -1.9574339389801025, "logits/rejected": -1.956535816192627, "logps/chosen": -0.6495180130004883, "logps/rejected": -1.802749514579773, "loss": 0.5925, "rewards/accuracies": 1.0, "rewards/chosen": 0.9669061899185181, "rewards/margins": 0.21266692876815796, "rewards/rejected": 0.7542392611503601, "step": 5484 }, { "epoch": 2.96, "learning_rate": 1.675280591328108e-08, "logits/chosen": -2.1814470291137695, "logits/rejected": -2.319626808166504, "logps/chosen": -3.0019333362579346, "logps/rejected": -2.5424561500549316, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 0.5968743562698364, "rewards/margins": -0.015365898609161377, "rewards/rejected": 0.6122402548789978, "step": 5485 }, { "epoch": 2.96, "learning_rate": 1.673649854285629e-08, "logits/chosen": -2.0662262439727783, "logits/rejected": -2.267565965652466, "logps/chosen": -0.23484449088573456, "logps/rejected": -0.2633729577064514, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.7993443012237549, "rewards/margins": 0.017050564289093018, "rewards/rejected": 0.7822937369346619, "step": 5486 }, { "epoch": 2.96, "learning_rate": 1.6720197517664603e-08, "logits/chosen": -2.1139638423919678, "logits/rejected": -2.2986199855804443, "logps/chosen": -1.010733723640442, "logps/rejected": -9.668527603149414, "loss": 0.6361, "rewards/accuracies": 1.0, "rewards/chosen": 1.0637215375900269, "rewards/margins": 0.11758488416671753, "rewards/rejected": 0.9461366534233093, "step": 5487 }, { "epoch": 2.96, "learning_rate": 1.6703902840815537e-08, "logits/chosen": -2.02760648727417, "logits/rejected": -2.035862684249878, "logps/chosen": -7.718865394592285, "logps/rejected": -1.5154297351837158, "loss": 0.6044, "rewards/accuracies": 1.0, "rewards/chosen": 1.2713801860809326, "rewards/margins": 0.1860947608947754, "rewards/rejected": 1.0852854251861572, "step": 5488 }, { "epoch": 2.96, "learning_rate": 1.6687614515417414e-08, "logits/chosen": -1.9945359230041504, "logits/rejected": -2.3186326026916504, "logps/chosen": -0.418698787689209, "logps/rejected": -0.43168604373931885, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.9378296136856079, "rewards/margins": 0.01990795135498047, "rewards/rejected": 0.9179216623306274, "step": 5489 }, { "epoch": 2.96, "learning_rate": 1.6671332544577345e-08, "logits/chosen": -2.104964017868042, "logits/rejected": -2.3242623805999756, "logps/chosen": -3.825263023376465, "logps/rejected": -11.484358787536621, "loss": 0.8426, "rewards/accuracies": 0.0, "rewards/chosen": 0.9482758641242981, "rewards/margins": -0.2793636918067932, "rewards/rejected": 1.2276395559310913, "step": 5490 }, { "epoch": 2.96, "learning_rate": 1.665505693140121e-08, "logits/chosen": -2.0624241828918457, "logits/rejected": -2.040595531463623, "logps/chosen": -12.912542343139648, "logps/rejected": -5.027364253997803, "loss": 0.3912, "rewards/accuracies": 1.0, "rewards/chosen": 1.8357164859771729, "rewards/margins": 0.7365883588790894, "rewards/rejected": 1.0991281270980835, "step": 5491 }, { "epoch": 2.96, "learning_rate": 1.6638787678993687e-08, "logits/chosen": -1.9585517644882202, "logits/rejected": -2.2628395557403564, "logps/chosen": -0.23068411648273468, "logps/rejected": -0.22747188806533813, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.8619560599327087, "rewards/margins": 0.03132295608520508, "rewards/rejected": 0.8306331038475037, "step": 5492 }, { "epoch": 2.96, "learning_rate": 1.662252479045827e-08, "logits/chosen": -2.120145797729492, "logits/rejected": -2.2524161338806152, "logps/chosen": -2.9390406608581543, "logps/rejected": -2.960757255554199, "loss": 0.6606, "rewards/accuracies": 1.0, "rewards/chosen": 0.6440191864967346, "rewards/margins": 0.06609976291656494, "rewards/rejected": 0.5779194235801697, "step": 5493 }, { "epoch": 2.96, "learning_rate": 1.66062682688972e-08, "logits/chosen": -2.185920238494873, "logits/rejected": -2.1833221912384033, "logps/chosen": -5.952696800231934, "logps/rejected": -3.897181510925293, "loss": 0.2158, "rewards/accuracies": 1.0, "rewards/chosen": 1.8980926275253296, "rewards/margins": 1.4233232736587524, "rewards/rejected": 0.47476932406425476, "step": 5494 }, { "epoch": 2.96, "learning_rate": 1.659001811741151e-08, "logits/chosen": -1.9929523468017578, "logits/rejected": -2.2684783935546875, "logps/chosen": -0.30142393708229065, "logps/rejected": -0.3389228284358978, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.9000957608222961, "rewards/margins": -0.0018102526664733887, "rewards/rejected": 0.9019060134887695, "step": 5495 }, { "epoch": 2.96, "learning_rate": 1.6573774339101025e-08, "logits/chosen": -2.159726858139038, "logits/rejected": -2.1611595153808594, "logps/chosen": -1.7412066459655762, "logps/rejected": -4.491894721984863, "loss": 0.2603, "rewards/accuracies": 1.0, "rewards/chosen": 1.7268990278244019, "rewards/margins": 1.2128041982650757, "rewards/rejected": 0.5140948295593262, "step": 5496 }, { "epoch": 2.96, "learning_rate": 1.6557536937064363e-08, "logits/chosen": -2.195023536682129, "logits/rejected": -2.1834118366241455, "logps/chosen": -2.959805488586426, "logps/rejected": -11.229361534118652, "loss": 0.5191, "rewards/accuracies": 1.0, "rewards/chosen": 0.9012547731399536, "rewards/margins": 0.38499146699905396, "rewards/rejected": 0.5162633061408997, "step": 5497 }, { "epoch": 2.97, "learning_rate": 1.6541305914398907e-08, "logits/chosen": -2.1239120960235596, "logits/rejected": -2.128493070602417, "logps/chosen": -3.2786953449249268, "logps/rejected": -0.2994022071361542, "loss": 0.5722, "rewards/accuracies": 1.0, "rewards/chosen": 1.2998210191726685, "rewards/margins": 0.25863659381866455, "rewards/rejected": 1.041184425354004, "step": 5498 }, { "epoch": 2.97, "learning_rate": 1.6525081274200847e-08, "logits/chosen": -2.027362108230591, "logits/rejected": -2.034775972366333, "logps/chosen": -4.176347732543945, "logps/rejected": -8.105569839477539, "loss": 0.3728, "rewards/accuracies": 1.0, "rewards/chosen": 0.8975462317466736, "rewards/margins": 0.7946358323097229, "rewards/rejected": 0.10291042178869247, "step": 5499 }, { "epoch": 2.97, "learning_rate": 1.6508863019565113e-08, "logits/chosen": -2.3138556480407715, "logits/rejected": -2.1561923027038574, "logps/chosen": -26.842369079589844, "logps/rejected": -4.099742412567139, "loss": 0.1478, "rewards/accuracies": 1.0, "rewards/chosen": 2.207113742828369, "rewards/margins": 1.8372130393981934, "rewards/rejected": 0.3699006736278534, "step": 5500 }, { "epoch": 2.97, "learning_rate": 1.649265115358546e-08, "logits/chosen": -2.1095268726348877, "logits/rejected": -2.1100246906280518, "logps/chosen": -3.612790107727051, "logps/rejected": -2.2457737922668457, "loss": 0.3316, "rewards/accuracies": 1.0, "rewards/chosen": 1.5238381624221802, "rewards/margins": 0.9333693981170654, "rewards/rejected": 0.5904687643051147, "step": 5501 }, { "epoch": 2.97, "learning_rate": 1.6476445679354405e-08, "logits/chosen": -1.9741560220718384, "logits/rejected": -1.9816051721572876, "logps/chosen": -0.3759351968765259, "logps/rejected": -12.790382385253906, "loss": 0.2951, "rewards/accuracies": 1.0, "rewards/chosen": 1.0491679906845093, "rewards/margins": 1.0693727731704712, "rewards/rejected": -0.020204735919833183, "step": 5502 }, { "epoch": 2.97, "learning_rate": 1.6460246599963236e-08, "logits/chosen": -2.143404483795166, "logits/rejected": -2.173994541168213, "logps/chosen": -7.853422164916992, "logps/rejected": -16.596450805664062, "loss": 0.377, "rewards/accuracies": 1.0, "rewards/chosen": 1.3296186923980713, "rewards/margins": 0.7811130881309509, "rewards/rejected": 0.5485056042671204, "step": 5503 }, { "epoch": 2.97, "learning_rate": 1.6444053918502065e-08, "logits/chosen": -2.1729533672332764, "logits/rejected": -2.17460560798645, "logps/chosen": -3.730513095855713, "logps/rejected": -3.7292003631591797, "loss": 0.4822, "rewards/accuracies": 1.0, "rewards/chosen": 1.0625022649765015, "rewards/margins": 0.478618323802948, "rewards/rejected": 0.5838839411735535, "step": 5504 }, { "epoch": 2.97, "learning_rate": 1.6427867638059745e-08, "logits/chosen": -2.056187629699707, "logits/rejected": -2.276183843612671, "logps/chosen": -0.2189444601535797, "logps/rejected": -0.20705938339233398, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9697583317756653, "rewards/margins": 0.01444023847579956, "rewards/rejected": 0.9553180932998657, "step": 5505 }, { "epoch": 2.97, "learning_rate": 1.6411687761723903e-08, "logits/chosen": -2.0781965255737305, "logits/rejected": -2.274303436279297, "logps/chosen": -0.3682679533958435, "logps/rejected": -0.29707232117652893, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.8259162902832031, "rewards/margins": 0.015018761157989502, "rewards/rejected": 0.8108975291252136, "step": 5506 }, { "epoch": 2.97, "learning_rate": 1.639551429258097e-08, "logits/chosen": -2.1091017723083496, "logits/rejected": -2.3146305084228516, "logps/chosen": -0.7856763601303101, "logps/rejected": -0.7841175198554993, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.9038990139961243, "rewards/margins": 0.008208930492401123, "rewards/rejected": 0.8956900835037231, "step": 5507 }, { "epoch": 2.97, "learning_rate": 1.6379347233716135e-08, "logits/chosen": -1.9759716987609863, "logits/rejected": -2.256765365600586, "logps/chosen": -4.291200160980225, "logps/rejected": -3.724303722381592, "loss": 0.7093, "rewards/accuracies": 0.0, "rewards/chosen": 0.818454921245575, "rewards/margins": -0.031991422176361084, "rewards/rejected": 0.850446343421936, "step": 5508 }, { "epoch": 2.97, "learning_rate": 1.6363186588213367e-08, "logits/chosen": -2.003150463104248, "logits/rejected": -1.9986387491226196, "logps/chosen": -0.963451087474823, "logps/rejected": -5.466586589813232, "loss": 0.4378, "rewards/accuracies": 1.0, "rewards/chosen": 1.0491688251495361, "rewards/margins": 0.598981499671936, "rewards/rejected": 0.4501873552799225, "step": 5509 }, { "epoch": 2.97, "learning_rate": 1.6347032359155434e-08, "logits/chosen": -2.0562126636505127, "logits/rejected": -2.0534048080444336, "logps/chosen": -5.826698303222656, "logps/rejected": -2.798661470413208, "loss": 0.4821, "rewards/accuracies": 1.0, "rewards/chosen": 1.1352872848510742, "rewards/margins": 0.4789137840270996, "rewards/rejected": 0.6563735008239746, "step": 5510 }, { "epoch": 2.97, "learning_rate": 1.6330884549623842e-08, "logits/chosen": -2.099133014678955, "logits/rejected": -2.3118011951446533, "logps/chosen": -0.16935862600803375, "logps/rejected": -0.23720549046993256, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 1.0242819786071777, "rewards/margins": 0.02175438404083252, "rewards/rejected": 1.0025275945663452, "step": 5511 }, { "epoch": 2.97, "learning_rate": 1.63147431626989e-08, "logits/chosen": -2.077533483505249, "logits/rejected": -2.3439722061157227, "logps/chosen": -1.9748424291610718, "logps/rejected": -2.043931722640991, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 1.004381537437439, "rewards/margins": 0.006712377071380615, "rewards/rejected": 0.9976691603660583, "step": 5512 }, { "epoch": 2.97, "learning_rate": 1.629860820145969e-08, "logits/chosen": -2.075924873352051, "logits/rejected": -2.3013150691986084, "logps/chosen": -1.5750797986984253, "logps/rejected": -1.6729824542999268, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 1.0240827798843384, "rewards/margins": 0.024307072162628174, "rewards/rejected": 0.9997757077217102, "step": 5513 }, { "epoch": 2.97, "learning_rate": 1.628247966898406e-08, "logits/chosen": -2.149205446243286, "logits/rejected": -2.0458967685699463, "logps/chosen": -22.925233840942383, "logps/rejected": -3.5455105304718018, "loss": 0.1631, "rewards/accuracies": 1.0, "rewards/chosen": 2.297980785369873, "rewards/margins": 1.7305282354354858, "rewards/rejected": 0.5674525499343872, "step": 5514 }, { "epoch": 2.97, "learning_rate": 1.6266357568348625e-08, "logits/chosen": -2.0491068363189697, "logits/rejected": -2.2620434761047363, "logps/chosen": -0.3032274842262268, "logps/rejected": -0.2975997030735016, "loss": 0.6728, "rewards/accuracies": 1.0, "rewards/chosen": 0.8936693072319031, "rewards/margins": 0.04104727506637573, "rewards/rejected": 0.8526220321655273, "step": 5515 }, { "epoch": 2.98, "learning_rate": 1.6250241902628796e-08, "logits/chosen": -2.14349627494812, "logits/rejected": -2.1469128131866455, "logps/chosen": -1.162339687347412, "logps/rejected": -4.052968978881836, "loss": 0.4773, "rewards/accuracies": 1.0, "rewards/chosen": 1.0452073812484741, "rewards/margins": 0.4914146065711975, "rewards/rejected": 0.5537927746772766, "step": 5516 }, { "epoch": 2.98, "learning_rate": 1.6234132674898737e-08, "logits/chosen": -2.2975780963897705, "logits/rejected": -2.2054665088653564, "logps/chosen": -37.5097770690918, "logps/rejected": -7.83059549331665, "loss": 0.1342, "rewards/accuracies": 1.0, "rewards/chosen": 2.5240025520324707, "rewards/margins": 1.9408047199249268, "rewards/rejected": 0.5831977725028992, "step": 5517 }, { "epoch": 2.98, "learning_rate": 1.621802988823139e-08, "logits/chosen": -2.102677822113037, "logits/rejected": -2.285597562789917, "logps/chosen": -0.44278329610824585, "logps/rejected": -0.5377837419509888, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8571695685386658, "rewards/margins": 0.009792804718017578, "rewards/rejected": 0.8473767638206482, "step": 5518 }, { "epoch": 2.98, "learning_rate": 1.6201933545698465e-08, "logits/chosen": -2.0763843059539795, "logits/rejected": -2.2960093021392822, "logps/chosen": -0.6512786746025085, "logps/rejected": -0.5800535678863525, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 1.03071928024292, "rewards/margins": 0.03931987285614014, "rewards/rejected": 0.9913994073867798, "step": 5519 }, { "epoch": 2.98, "learning_rate": 1.6185843650370445e-08, "logits/chosen": -2.1354928016662598, "logits/rejected": -2.143838882446289, "logps/chosen": -1.4733316898345947, "logps/rejected": -5.683660984039307, "loss": 0.4351, "rewards/accuracies": 1.0, "rewards/chosen": 1.0581214427947998, "rewards/margins": 0.6066601276397705, "rewards/rejected": 0.4514612853527069, "step": 5520 }, { "epoch": 2.98, "learning_rate": 1.6169760205316596e-08, "logits/chosen": -2.165144205093384, "logits/rejected": -2.1700258255004883, "logps/chosen": -3.6815779209136963, "logps/rejected": -3.7895755767822266, "loss": 0.5497, "rewards/accuracies": 1.0, "rewards/chosen": 0.9526902437210083, "rewards/margins": 0.31086266040802, "rewards/rejected": 0.6418275833129883, "step": 5521 }, { "epoch": 2.98, "learning_rate": 1.615368321360493e-08, "logits/chosen": -1.9689377546310425, "logits/rejected": -1.9718822240829468, "logps/chosen": -2.4619057178497314, "logps/rejected": -0.5800432562828064, "loss": 0.6262, "rewards/accuracies": 1.0, "rewards/chosen": 1.125264286994934, "rewards/margins": 0.1387374997138977, "rewards/rejected": 0.9865267872810364, "step": 5522 }, { "epoch": 2.98, "learning_rate": 1.613761267830224e-08, "logits/chosen": -2.1237246990203857, "logits/rejected": -2.121673822402954, "logps/chosen": -0.9844272136688232, "logps/rejected": -2.4811179637908936, "loss": 0.6085, "rewards/accuracies": 1.0, "rewards/chosen": 1.1763430833816528, "rewards/margins": 0.17706072330474854, "rewards/rejected": 0.9992823600769043, "step": 5523 }, { "epoch": 2.98, "learning_rate": 1.6121548602474096e-08, "logits/chosen": -2.0750083923339844, "logits/rejected": -2.0664029121398926, "logps/chosen": -3.9666807651519775, "logps/rejected": -7.915304183959961, "loss": 0.3094, "rewards/accuracies": 1.0, "rewards/chosen": 1.2350752353668213, "rewards/margins": 1.014495849609375, "rewards/rejected": 0.22057943046092987, "step": 5524 }, { "epoch": 2.98, "learning_rate": 1.610549098918481e-08, "logits/chosen": -2.010268449783325, "logits/rejected": -2.2464993000030518, "logps/chosen": -0.5880038142204285, "logps/rejected": -0.80721515417099, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.7971201539039612, "rewards/margins": 0.018791556358337402, "rewards/rejected": 0.7783285975456238, "step": 5525 }, { "epoch": 2.98, "learning_rate": 1.608943984149748e-08, "logits/chosen": -2.18375301361084, "logits/rejected": -2.182650089263916, "logps/chosen": -0.8664112091064453, "logps/rejected": -8.229691505432129, "loss": 0.5212, "rewards/accuracies": 1.0, "rewards/chosen": 1.2542345523834229, "rewards/margins": 0.37969738245010376, "rewards/rejected": 0.8745371699333191, "step": 5526 }, { "epoch": 2.98, "learning_rate": 1.6073395162473997e-08, "logits/chosen": -2.0923495292663574, "logits/rejected": -2.3222639560699463, "logps/chosen": -1.387850046157837, "logps/rejected": -1.2031620740890503, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.7885845303535461, "rewards/margins": 0.014789879322052002, "rewards/rejected": 0.7737946510314941, "step": 5527 }, { "epoch": 2.98, "learning_rate": 1.605735695517496e-08, "logits/chosen": -2.117642641067505, "logits/rejected": -2.3161513805389404, "logps/chosen": -9.99599838256836, "logps/rejected": -5.51975154876709, "loss": 0.5983, "rewards/accuracies": 1.0, "rewards/chosen": 1.0884672403335571, "rewards/margins": 0.19956886768341064, "rewards/rejected": 0.8888983726501465, "step": 5528 }, { "epoch": 2.98, "learning_rate": 1.60413252226598e-08, "logits/chosen": -2.098327875137329, "logits/rejected": -2.100217580795288, "logps/chosen": -1.0215100049972534, "logps/rejected": -2.7489898204803467, "loss": 0.4597, "rewards/accuracies": 1.0, "rewards/chosen": 1.2994903326034546, "rewards/margins": 0.5386745929718018, "rewards/rejected": 0.7608157396316528, "step": 5529 }, { "epoch": 2.98, "learning_rate": 1.6025299967986627e-08, "logits/chosen": -1.9916472434997559, "logits/rejected": -2.298740863800049, "logps/chosen": -1.041024923324585, "logps/rejected": -1.0371711254119873, "loss": 0.671, "rewards/accuracies": 1.0, "rewards/chosen": 1.1187492609024048, "rewards/margins": 0.04483687877655029, "rewards/rejected": 1.0739123821258545, "step": 5530 }, { "epoch": 2.98, "learning_rate": 1.6009281194212388e-08, "logits/chosen": -1.9652705192565918, "logits/rejected": -2.345853567123413, "logps/chosen": -5.869194984436035, "logps/rejected": -6.442368984222412, "loss": 0.6471, "rewards/accuracies": 1.0, "rewards/chosen": 1.2404526472091675, "rewards/margins": 0.09439301490783691, "rewards/rejected": 1.1460596323013306, "step": 5531 }, { "epoch": 2.98, "learning_rate": 1.599326890439277e-08, "logits/chosen": -2.141979694366455, "logits/rejected": -2.140198230743408, "logps/chosen": -0.33025166392326355, "logps/rejected": -3.1433048248291016, "loss": 0.5959, "rewards/accuracies": 1.0, "rewards/chosen": 0.949539840221405, "rewards/margins": 0.20491957664489746, "rewards/rejected": 0.7446202635765076, "step": 5532 }, { "epoch": 2.98, "learning_rate": 1.5977263101582222e-08, "logits/chosen": -2.0133819580078125, "logits/rejected": -2.0198285579681396, "logps/chosen": -1.432671070098877, "logps/rejected": -3.9003195762634277, "loss": 0.4768, "rewards/accuracies": 1.0, "rewards/chosen": 0.995983898639679, "rewards/margins": 0.49290692806243896, "rewards/rejected": 0.50307697057724, "step": 5533 }, { "epoch": 2.98, "learning_rate": 1.5961263788833956e-08, "logits/chosen": -1.999671459197998, "logits/rejected": -1.9762822389602661, "logps/chosen": -9.462237358093262, "logps/rejected": -2.991196870803833, "loss": 0.4621, "rewards/accuracies": 1.0, "rewards/chosen": 1.6107521057128906, "rewards/margins": 0.5320273637771606, "rewards/rejected": 1.07872474193573, "step": 5534 }, { "epoch": 2.99, "learning_rate": 1.594527096919995e-08, "logits/chosen": -2.272926092147827, "logits/rejected": -2.2760956287384033, "logps/chosen": -0.7871065735816956, "logps/rejected": -0.7226401567459106, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.9254429936408997, "rewards/margins": -0.00322645902633667, "rewards/rejected": 0.9286694526672363, "step": 5535 }, { "epoch": 2.99, "learning_rate": 1.592928464573094e-08, "logits/chosen": -2.120182514190674, "logits/rejected": -2.116029977798462, "logps/chosen": -5.304689407348633, "logps/rejected": -4.9380106925964355, "loss": 0.3658, "rewards/accuracies": 1.0, "rewards/chosen": 1.3392413854599, "rewards/margins": 0.8172837495803833, "rewards/rejected": 0.5219576358795166, "step": 5536 }, { "epoch": 2.99, "learning_rate": 1.5913304821476404e-08, "logits/chosen": -2.037276029586792, "logits/rejected": -2.0425865650177, "logps/chosen": -0.9811475276947021, "logps/rejected": -3.4584391117095947, "loss": 0.4552, "rewards/accuracies": 1.0, "rewards/chosen": 1.1235206127166748, "rewards/margins": 0.5507341027259827, "rewards/rejected": 0.5727865099906921, "step": 5537 }, { "epoch": 2.99, "learning_rate": 1.5897331499484628e-08, "logits/chosen": -2.0955586433410645, "logits/rejected": -2.3601527214050293, "logps/chosen": -1.9603915214538574, "logps/rejected": -1.6752586364746094, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 1.100785255432129, "rewards/margins": 0.03654491901397705, "rewards/rejected": 1.0642403364181519, "step": 5538 }, { "epoch": 2.99, "learning_rate": 1.5881364682802618e-08, "logits/chosen": -2.1175127029418945, "logits/rejected": -2.3136160373687744, "logps/chosen": -0.40684545040130615, "logps/rejected": -0.4142448902130127, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.8410162925720215, "rewards/margins": -0.0044664740562438965, "rewards/rejected": 0.8454827666282654, "step": 5539 }, { "epoch": 2.99, "learning_rate": 1.5865404374476154e-08, "logits/chosen": -2.0178005695343018, "logits/rejected": -2.019304037094116, "logps/chosen": -1.7937887907028198, "logps/rejected": -6.922159671783447, "loss": 0.3685, "rewards/accuracies": 1.0, "rewards/chosen": 1.169826865196228, "rewards/margins": 0.808380126953125, "rewards/rejected": 0.36144670844078064, "step": 5540 }, { "epoch": 2.99, "learning_rate": 1.5849450577549762e-08, "logits/chosen": -2.1126210689544678, "logits/rejected": -2.119502067565918, "logps/chosen": -1.2900454998016357, "logps/rejected": -3.3236405849456787, "loss": 0.4207, "rewards/accuracies": 1.0, "rewards/chosen": 1.1487802267074585, "rewards/margins": 0.6479917168617249, "rewards/rejected": 0.5007885098457336, "step": 5541 }, { "epoch": 2.99, "learning_rate": 1.583350329506674e-08, "logits/chosen": -2.1566479206085205, "logits/rejected": -2.131680727005005, "logps/chosen": -7.389315128326416, "logps/rejected": -5.603201866149902, "loss": 0.2758, "rewards/accuracies": 1.0, "rewards/chosen": 1.5390605926513672, "rewards/margins": 1.1469037532806396, "rewards/rejected": 0.39215680956840515, "step": 5542 }, { "epoch": 2.99, "learning_rate": 1.5817562530069135e-08, "logits/chosen": -2.0610413551330566, "logits/rejected": -2.0782909393310547, "logps/chosen": -0.9006665349006653, "logps/rejected": -3.158566951751709, "loss": 0.6084, "rewards/accuracies": 1.0, "rewards/chosen": 1.0036860704421997, "rewards/margins": 0.17725878953933716, "rewards/rejected": 0.8264272809028625, "step": 5543 }, { "epoch": 2.99, "learning_rate": 1.5801628285597757e-08, "logits/chosen": -2.1621227264404297, "logits/rejected": -2.1690266132354736, "logps/chosen": -1.5507525205612183, "logps/rejected": -3.8072338104248047, "loss": 0.4911, "rewards/accuracies": 1.0, "rewards/chosen": 1.0307422876358032, "rewards/margins": 0.4555736780166626, "rewards/rejected": 0.5751686096191406, "step": 5544 }, { "epoch": 2.99, "learning_rate": 1.578570056469216e-08, "logits/chosen": -2.1302075386047363, "logits/rejected": -2.2406563758850098, "logps/chosen": -2.214834451675415, "logps/rejected": -2.12062931060791, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9539548754692078, "rewards/margins": 0.01240450143814087, "rewards/rejected": 0.9415503740310669, "step": 5545 }, { "epoch": 2.99, "learning_rate": 1.576977937039069e-08, "logits/chosen": -2.1545727252960205, "logits/rejected": -2.3385660648345947, "logps/chosen": -7.86971378326416, "logps/rejected": -8.585248947143555, "loss": 0.4293, "rewards/accuracies": 1.0, "rewards/chosen": 1.2171448469161987, "rewards/margins": 0.6233788132667542, "rewards/rejected": 0.5937660336494446, "step": 5546 }, { "epoch": 2.99, "learning_rate": 1.575386470573038e-08, "logits/chosen": -2.1582796573638916, "logits/rejected": -2.2525904178619385, "logps/chosen": -0.1910044550895691, "logps/rejected": -0.21463829278945923, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.8816470503807068, "rewards/margins": 0.01873856782913208, "rewards/rejected": 0.8629084825515747, "step": 5547 }, { "epoch": 2.99, "learning_rate": 1.573795657374706e-08, "logits/chosen": -2.0190091133117676, "logits/rejected": -2.2747390270233154, "logps/chosen": -0.9241506457328796, "logps/rejected": -1.0920833349227905, "loss": 0.6996, "rewards/accuracies": 0.0, "rewards/chosen": 1.0101786851882935, "rewards/margins": -0.012809514999389648, "rewards/rejected": 1.022988200187683, "step": 5548 }, { "epoch": 2.99, "learning_rate": 1.572205497747534e-08, "logits/chosen": -2.0541155338287354, "logits/rejected": -2.060708999633789, "logps/chosen": -1.4737969636917114, "logps/rejected": -4.795138359069824, "loss": 0.4239, "rewards/accuracies": 1.0, "rewards/chosen": 1.1294195652008057, "rewards/margins": 0.6389505863189697, "rewards/rejected": 0.49046897888183594, "step": 5549 }, { "epoch": 2.99, "learning_rate": 1.5706159919948547e-08, "logits/chosen": -2.0104033946990967, "logits/rejected": -2.266604423522949, "logps/chosen": -4.091449737548828, "logps/rejected": -3.8956379890441895, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8852468729019165, "rewards/margins": 0.009782254695892334, "rewards/rejected": 0.8754646182060242, "step": 5550 }, { "epoch": 2.99, "learning_rate": 1.5690271404198763e-08, "logits/chosen": -2.2729427814483643, "logits/rejected": -2.2213730812072754, "logps/chosen": -31.72626304626465, "logps/rejected": -11.704896926879883, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 2.737147808074951, "rewards/margins": 1.9870623350143433, "rewards/rejected": 0.7500854730606079, "step": 5551 }, { "epoch": 2.99, "learning_rate": 1.5674389433256823e-08, "logits/chosen": -2.065140962600708, "logits/rejected": -2.299952983856201, "logps/chosen": -1.2112852334976196, "logps/rejected": -1.1465399265289307, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.783890962600708, "rewards/margins": 0.015461266040802002, "rewards/rejected": 0.768429696559906, "step": 5552 }, { "epoch": 3.0, "learning_rate": 1.565851401015233e-08, "logits/chosen": -2.0560266971588135, "logits/rejected": -2.3261847496032715, "logps/chosen": -0.31393352150917053, "logps/rejected": -2.9265754222869873, "loss": 0.5349, "rewards/accuracies": 1.0, "rewards/chosen": 0.9565321207046509, "rewards/margins": 0.3463010787963867, "rewards/rejected": 0.6102310419082642, "step": 5553 }, { "epoch": 3.0, "learning_rate": 1.564264513791362e-08, "logits/chosen": -2.0630311965942383, "logits/rejected": -2.2506766319274902, "logps/chosen": -0.25401976704597473, "logps/rejected": -0.23806673288345337, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.7183021306991577, "rewards/margins": 0.036631762981414795, "rewards/rejected": 0.6816703677177429, "step": 5554 }, { "epoch": 3.0, "learning_rate": 1.562678281956778e-08, "logits/chosen": -2.059901237487793, "logits/rejected": -2.2588460445404053, "logps/chosen": -0.470625638961792, "logps/rejected": -0.6243001222610474, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.9547246098518372, "rewards/margins": 0.003589808940887451, "rewards/rejected": 0.9511348009109497, "step": 5555 }, { "epoch": 3.0, "learning_rate": 1.561092705814066e-08, "logits/chosen": -2.0706472396850586, "logits/rejected": -2.2195050716400146, "logps/chosen": -0.3397289514541626, "logps/rejected": -0.31933629512786865, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.800462543964386, "rewards/margins": -0.005042552947998047, "rewards/rejected": 0.805505096912384, "step": 5556 }, { "epoch": 3.0, "learning_rate": 1.5595077856656846e-08, "logits/chosen": -1.9837968349456787, "logits/rejected": -2.291804313659668, "logps/chosen": -1.2491575479507446, "logps/rejected": -1.4933675527572632, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.6993560194969177, "rewards/margins": 0.02938896417617798, "rewards/rejected": 0.6699670553207397, "step": 5557 }, { "epoch": 3.0, "learning_rate": 1.557923521813968e-08, "logits/chosen": -2.080845594406128, "logits/rejected": -2.0850207805633545, "logps/chosen": -0.36717140674591064, "logps/rejected": -8.617300033569336, "loss": 0.3292, "rewards/accuracies": 1.0, "rewards/chosen": 0.9906682968139648, "rewards/margins": 0.9421541094779968, "rewards/rejected": 0.04851417616009712, "step": 5558 }, { "epoch": 3.0, "learning_rate": 1.5563399145611257e-08, "logits/chosen": -2.0271856784820557, "logits/rejected": -2.0308353900909424, "logps/chosen": -4.5544843673706055, "logps/rejected": -6.7855095863342285, "loss": 0.3829, "rewards/accuracies": 1.0, "rewards/chosen": 1.2973389625549316, "rewards/margins": 0.7623145580291748, "rewards/rejected": 0.5350244045257568, "step": 5559 }, { "epoch": 3.0, "learning_rate": 1.5547569642092395e-08, "logits/chosen": -2.0740044116973877, "logits/rejected": -2.073716640472412, "logps/chosen": -1.132246971130371, "logps/rejected": -0.9038478136062622, "loss": 0.6411, "rewards/accuracies": 1.0, "rewards/chosen": 0.9738795161247253, "rewards/margins": 0.10703420639038086, "rewards/rejected": 0.8668453097343445, "step": 5560 }, { "epoch": 3.0, "learning_rate": 1.55317467106027e-08, "logits/chosen": -2.0054214000701904, "logits/rejected": -2.236015558242798, "logps/chosen": -1.9082531929016113, "logps/rejected": -1.5178676843643188, "loss": 0.7055, "rewards/accuracies": 0.0, "rewards/chosen": 0.6733273267745972, "rewards/margins": -0.02454841136932373, "rewards/rejected": 0.6978757381439209, "step": 5561 }, { "epoch": 3.0, "learning_rate": 1.5515930354160485e-08, "logits/chosen": -2.118027687072754, "logits/rejected": -2.103919744491577, "logps/chosen": -13.01883316040039, "logps/rejected": -6.225605010986328, "loss": 0.2102, "rewards/accuracies": 1.0, "rewards/chosen": 1.8646093606948853, "rewards/margins": 1.4529328346252441, "rewards/rejected": 0.4116764962673187, "step": 5562 }, { "epoch": 3.0, "learning_rate": 1.550012057578282e-08, "logits/chosen": -2.020960569381714, "logits/rejected": -2.0308680534362793, "logps/chosen": -1.8841594457626343, "logps/rejected": -2.1677708625793457, "loss": 0.4164, "rewards/accuracies": 1.0, "rewards/chosen": 1.29723060131073, "rewards/margins": 0.6607982516288757, "rewards/rejected": 0.6364323496818542, "step": 5563 }, { "epoch": 3.0, "learning_rate": 1.5484317378485545e-08, "logits/chosen": -2.157614231109619, "logits/rejected": -2.3594019412994385, "logps/chosen": -1.1489702463150024, "logps/rejected": -1.0268175601959229, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.8924422264099121, "rewards/margins": 0.012820303440093994, "rewards/rejected": 0.8796219229698181, "step": 5564 }, { "epoch": 3.0, "learning_rate": 1.5468520765283204e-08, "logits/chosen": -2.000472068786621, "logits/rejected": -2.3008346557617188, "logps/chosen": -0.6651411056518555, "logps/rejected": -0.802101194858551, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": 0.7905302047729492, "rewards/margins": 0.04602891206741333, "rewards/rejected": 0.7445012927055359, "step": 5565 }, { "epoch": 3.0, "learning_rate": 1.5452730739189122e-08, "logits/chosen": -2.0593948364257812, "logits/rejected": -2.0577545166015625, "logps/chosen": -5.865894317626953, "logps/rejected": -2.414395809173584, "loss": 0.3617, "rewards/accuracies": 1.0, "rewards/chosen": 1.5652275085449219, "rewards/margins": 0.8307417035102844, "rewards/rejected": 0.7344858050346375, "step": 5566 }, { "epoch": 3.0, "learning_rate": 1.5436947303215343e-08, "logits/chosen": -2.083482027053833, "logits/rejected": -2.3202903270721436, "logps/chosen": -2.5805411338806152, "logps/rejected": -8.984930038452148, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.8567360043525696, "rewards/margins": 0.031839191913604736, "rewards/rejected": 0.8248968124389648, "step": 5567 }, { "epoch": 3.0, "learning_rate": 1.5421170460372658e-08, "logits/chosen": -2.028371572494507, "logits/rejected": -2.035054922103882, "logps/chosen": -0.8804989457130432, "logps/rejected": -3.2937440872192383, "loss": 0.4652, "rewards/accuracies": 1.0, "rewards/chosen": 1.0488322973251343, "rewards/margins": 0.523706316947937, "rewards/rejected": 0.5251259803771973, "step": 5568 }, { "epoch": 3.0, "learning_rate": 1.5405400213670612e-08, "logits/chosen": -2.0445609092712402, "logits/rejected": -2.0243403911590576, "logps/chosen": -4.3380255699157715, "logps/rejected": -6.100822925567627, "loss": 0.2572, "rewards/accuracies": 1.0, "rewards/chosen": 1.5551401376724243, "rewards/margins": 1.2264013290405273, "rewards/rejected": 0.32873883843421936, "step": 5569 }, { "epoch": 3.0, "learning_rate": 1.5389636566117475e-08, "logits/chosen": -2.0382394790649414, "logits/rejected": -2.337334394454956, "logps/chosen": -0.6616541743278503, "logps/rejected": -0.600433349609375, "loss": 0.7056, "rewards/accuracies": 0.0, "rewards/chosen": 1.032643437385559, "rewards/margins": -0.024731874465942383, "rewards/rejected": 1.0573753118515015, "step": 5570 }, { "epoch": 3.0, "learning_rate": 1.537387952072026e-08, "logits/chosen": -2.000857353210449, "logits/rejected": -2.008906364440918, "logps/chosen": -1.8108294010162354, "logps/rejected": -4.6294636726379395, "loss": 0.4176, "rewards/accuracies": 1.0, "rewards/chosen": 1.1314178705215454, "rewards/margins": 0.657199501991272, "rewards/rejected": 0.47421833872795105, "step": 5571 }, { "epoch": 3.01, "learning_rate": 1.5358129080484762e-08, "logits/chosen": -2.1014628410339355, "logits/rejected": -2.3214237689971924, "logps/chosen": -0.5535467267036438, "logps/rejected": -6.348858833312988, "loss": 0.604, "rewards/accuracies": 1.0, "rewards/chosen": 1.0857104063034058, "rewards/margins": 0.1869860291481018, "rewards/rejected": 0.898724377155304, "step": 5572 }, { "epoch": 3.01, "learning_rate": 1.534238524841545e-08, "logits/chosen": -2.1768386363983154, "logits/rejected": -2.3305816650390625, "logps/chosen": -1.3221030235290527, "logps/rejected": -1.3452675342559814, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.8673288226127625, "rewards/margins": -0.007880032062530518, "rewards/rejected": 0.875208854675293, "step": 5573 }, { "epoch": 3.01, "learning_rate": 1.5326648027515583e-08, "logits/chosen": -2.1193413734436035, "logits/rejected": -2.1713340282440186, "logps/chosen": -6.770650863647461, "logps/rejected": -9.817153930664062, "loss": 0.3532, "rewards/accuracies": 1.0, "rewards/chosen": 1.520705223083496, "rewards/margins": 0.8587878942489624, "rewards/rejected": 0.6619173288345337, "step": 5574 }, { "epoch": 3.01, "learning_rate": 1.531091742078713e-08, "logits/chosen": -2.2813756465911865, "logits/rejected": -2.220182418823242, "logps/chosen": -10.403777122497559, "logps/rejected": -9.675582885742188, "loss": 0.1859, "rewards/accuracies": 1.0, "rewards/chosen": 2.102285861968994, "rewards/margins": 1.5879452228546143, "rewards/rejected": 0.5143405795097351, "step": 5575 }, { "epoch": 3.01, "learning_rate": 1.5295193431230817e-08, "logits/chosen": -2.1534841060638428, "logits/rejected": -2.3374788761138916, "logps/chosen": -2.761770248413086, "logps/rejected": -2.8675944805145264, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9582306146621704, "rewards/margins": 0.01785910129547119, "rewards/rejected": 0.9403715133666992, "step": 5576 }, { "epoch": 3.01, "learning_rate": 1.5279476061846088e-08, "logits/chosen": -1.9550294876098633, "logits/rejected": -1.9553462266921997, "logps/chosen": -1.3661788702011108, "logps/rejected": -6.367595672607422, "loss": 0.4869, "rewards/accuracies": 1.0, "rewards/chosen": 1.0829564332962036, "rewards/margins": 0.4662904143333435, "rewards/rejected": 0.6166660189628601, "step": 5577 }, { "epoch": 3.01, "learning_rate": 1.5263765315631133e-08, "logits/chosen": -2.013396739959717, "logits/rejected": -2.2963595390319824, "logps/chosen": -0.6774380207061768, "logps/rejected": -0.6972018480300903, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.8912652134895325, "rewards/margins": 0.007531344890594482, "rewards/rejected": 0.883733868598938, "step": 5578 }, { "epoch": 3.01, "learning_rate": 1.5248061195582884e-08, "logits/chosen": -2.129398822784424, "logits/rejected": -2.055659532546997, "logps/chosen": -19.060535430908203, "logps/rejected": -2.5598721504211426, "loss": 0.318, "rewards/accuracies": 1.0, "rewards/chosen": 1.6126571893692017, "rewards/margins": 0.9825241565704346, "rewards/rejected": 0.6301330327987671, "step": 5579 }, { "epoch": 3.01, "learning_rate": 1.5232363704697015e-08, "logits/chosen": -2.0814337730407715, "logits/rejected": -2.303487539291382, "logps/chosen": -0.2961637079715729, "logps/rejected": -0.2675876021385193, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8476164937019348, "rewards/margins": 0.008759677410125732, "rewards/rejected": 0.8388568162918091, "step": 5580 }, { "epoch": 3.01, "learning_rate": 1.5216672845967914e-08, "logits/chosen": -2.0529873371124268, "logits/rejected": -2.296308994293213, "logps/chosen": -2.0446767807006836, "logps/rejected": -1.7580562829971313, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.9482325911521912, "rewards/margins": -0.003450155258178711, "rewards/rejected": 0.9516827464103699, "step": 5581 }, { "epoch": 3.01, "learning_rate": 1.520098862238871e-08, "logits/chosen": -2.179790496826172, "logits/rejected": -2.1801016330718994, "logps/chosen": -0.766893744468689, "logps/rejected": -5.180239200592041, "loss": 0.4023, "rewards/accuracies": 1.0, "rewards/chosen": 1.0811350345611572, "rewards/margins": 0.7027218341827393, "rewards/rejected": 0.3784131705760956, "step": 5582 }, { "epoch": 3.01, "learning_rate": 1.51853110369513e-08, "logits/chosen": -2.0920474529266357, "logits/rejected": -2.25421404838562, "logps/chosen": -0.2854243814945221, "logps/rejected": -0.3683057129383087, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.9297665953636169, "rewards/margins": 0.02899038791656494, "rewards/rejected": 0.900776207447052, "step": 5583 }, { "epoch": 3.01, "learning_rate": 1.5169640092646274e-08, "logits/chosen": -2.117201566696167, "logits/rejected": -2.119049072265625, "logps/chosen": -0.19799385964870453, "logps/rejected": -6.240667343139648, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9968926310539246, "rewards/margins": 0.7619255781173706, "rewards/rejected": 0.23496703803539276, "step": 5584 }, { "epoch": 3.01, "learning_rate": 1.5153975792462965e-08, "logits/chosen": -2.04632830619812, "logits/rejected": -2.24782133102417, "logps/chosen": -0.8472934365272522, "logps/rejected": -1.0559029579162598, "loss": 0.6679, "rewards/accuracies": 1.0, "rewards/chosen": 0.7973204255104065, "rewards/margins": 0.05110293626785278, "rewards/rejected": 0.7462174892425537, "step": 5585 }, { "epoch": 3.01, "learning_rate": 1.5138318139389438e-08, "logits/chosen": -2.1412241458892822, "logits/rejected": -2.147589683532715, "logps/chosen": -3.4981961250305176, "logps/rejected": -2.3521597385406494, "loss": 0.4339, "rewards/accuracies": 1.0, "rewards/chosen": 1.196396827697754, "rewards/margins": 0.610234260559082, "rewards/rejected": 0.5861625671386719, "step": 5586 }, { "epoch": 3.01, "learning_rate": 1.5122667136412508e-08, "logits/chosen": -2.0326409339904785, "logits/rejected": -2.0398569107055664, "logps/chosen": -1.6344866752624512, "logps/rejected": -5.7276692390441895, "loss": 0.3887, "rewards/accuracies": 1.0, "rewards/chosen": 1.0064404010772705, "rewards/margins": 0.744187593460083, "rewards/rejected": 0.2622527778148651, "step": 5587 }, { "epoch": 3.01, "learning_rate": 1.5107022786517692e-08, "logits/chosen": -2.1376678943634033, "logits/rejected": -2.150914192199707, "logps/chosen": -2.7188174724578857, "logps/rejected": -11.148300170898438, "loss": 0.5809, "rewards/accuracies": 1.0, "rewards/chosen": 1.267905592918396, "rewards/margins": 0.23868083953857422, "rewards/rejected": 1.0292247533798218, "step": 5588 }, { "epoch": 3.01, "learning_rate": 1.5091385092689263e-08, "logits/chosen": -2.21651291847229, "logits/rejected": -2.2180397510528564, "logps/chosen": -1.1341872215270996, "logps/rejected": -3.1722424030303955, "loss": 0.4672, "rewards/accuracies": 1.0, "rewards/chosen": 1.2822574377059937, "rewards/margins": 0.5181921720504761, "rewards/rejected": 0.7640652656555176, "step": 5589 }, { "epoch": 3.02, "learning_rate": 1.5075754057910213e-08, "logits/chosen": -2.000850200653076, "logits/rejected": -1.9978828430175781, "logps/chosen": -5.024999141693115, "logps/rejected": -2.3096423149108887, "loss": 0.2996, "rewards/accuracies": 1.0, "rewards/chosen": 1.745374321937561, "rewards/margins": 1.0516774654388428, "rewards/rejected": 0.693696916103363, "step": 5590 }, { "epoch": 3.02, "learning_rate": 1.506012968516226e-08, "logits/chosen": -2.2572245597839355, "logits/rejected": -2.4398410320281982, "logps/chosen": -9.542667388916016, "logps/rejected": -25.5079345703125, "loss": 0.6374, "rewards/accuracies": 1.0, "rewards/chosen": 1.2130054235458374, "rewards/margins": 0.11481547355651855, "rewards/rejected": 1.0981899499893188, "step": 5591 }, { "epoch": 3.02, "learning_rate": 1.504451197742586e-08, "logits/chosen": -2.107226610183716, "logits/rejected": -2.1097941398620605, "logps/chosen": -0.5033811926841736, "logps/rejected": -5.3550848960876465, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 1.0973752737045288, "rewards/margins": 0.693134069442749, "rewards/rejected": 0.4042412340641022, "step": 5592 }, { "epoch": 3.02, "learning_rate": 1.50289009376802e-08, "logits/chosen": -2.0537991523742676, "logits/rejected": -2.2712700366973877, "logps/chosen": -6.07805061340332, "logps/rejected": -4.251682281494141, "loss": 0.6487, "rewards/accuracies": 1.0, "rewards/chosen": 0.9161584973335266, "rewards/margins": 0.09103983640670776, "rewards/rejected": 0.8251186609268188, "step": 5593 }, { "epoch": 3.02, "learning_rate": 1.501329656890318e-08, "logits/chosen": -2.2454946041107178, "logits/rejected": -2.067934989929199, "logps/chosen": -44.826969146728516, "logps/rejected": -10.275325775146484, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": 2.9236690998077393, "rewards/margins": 1.868630051612854, "rewards/rejected": 1.0550390481948853, "step": 5594 }, { "epoch": 3.02, "learning_rate": 1.499769887407144e-08, "logits/chosen": -2.1849966049194336, "logits/rejected": -2.188225507736206, "logps/chosen": -2.1817827224731445, "logps/rejected": -4.521956920623779, "loss": 0.4774, "rewards/accuracies": 1.0, "rewards/chosen": 1.125226378440857, "rewards/margins": 0.49121248722076416, "rewards/rejected": 0.6340138912200928, "step": 5595 }, { "epoch": 3.02, "learning_rate": 1.498210785616035e-08, "logits/chosen": -2.1483848094940186, "logits/rejected": -2.141444683074951, "logps/chosen": -0.6780035495758057, "logps/rejected": -10.189475059509277, "loss": 0.3159, "rewards/accuracies": 1.0, "rewards/chosen": 1.198485255241394, "rewards/margins": 0.9903801679611206, "rewards/rejected": 0.20810508728027344, "step": 5596 }, { "epoch": 3.02, "learning_rate": 1.4966523518143988e-08, "logits/chosen": -2.0926170349121094, "logits/rejected": -2.0793282985687256, "logps/chosen": -3.678091049194336, "logps/rejected": -6.8024067878723145, "loss": 0.2513, "rewards/accuracies": 1.0, "rewards/chosen": 1.5012325048446655, "rewards/margins": 1.2528079748153687, "rewards/rejected": 0.24842457473278046, "step": 5597 }, { "epoch": 3.02, "learning_rate": 1.4950945862995172e-08, "logits/chosen": -2.075028657913208, "logits/rejected": -2.076246738433838, "logps/chosen": -1.2623149156570435, "logps/rejected": -3.007521152496338, "loss": 0.5175, "rewards/accuracies": 1.0, "rewards/chosen": 1.2246474027633667, "rewards/margins": 0.3889731168746948, "rewards/rejected": 0.8356742858886719, "step": 5598 }, { "epoch": 3.02, "learning_rate": 1.493537489368545e-08, "logits/chosen": -2.073452949523926, "logits/rejected": -2.2699878215789795, "logps/chosen": -1.7138910293579102, "logps/rejected": -1.6327325105667114, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.6197847723960876, "rewards/margins": 0.0024802088737487793, "rewards/rejected": 0.6173045635223389, "step": 5599 }, { "epoch": 3.02, "learning_rate": 1.4919810613185087e-08, "logits/chosen": -2.099468946456909, "logits/rejected": -2.324873924255371, "logps/chosen": -24.079444885253906, "logps/rejected": -25.426780700683594, "loss": 0.7036, "rewards/accuracies": 0.0, "rewards/chosen": 1.0046287775039673, "rewards/margins": -0.020764946937561035, "rewards/rejected": 1.0253937244415283, "step": 5600 }, { "epoch": 3.02, "learning_rate": 1.4904253024463066e-08, "logits/chosen": -2.1376681327819824, "logits/rejected": -2.139125347137451, "logps/chosen": -0.7309321165084839, "logps/rejected": -4.773613452911377, "loss": 0.5032, "rewards/accuracies": 1.0, "rewards/chosen": 0.9445948600769043, "rewards/margins": 0.4247133731842041, "rewards/rejected": 0.5198814868927002, "step": 5601 }, { "epoch": 3.02, "learning_rate": 1.4888702130487103e-08, "logits/chosen": -2.1077115535736084, "logits/rejected": -2.2835357189178467, "logps/chosen": -0.7818641066551208, "logps/rejected": -0.8437779545783997, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9042801856994629, "rewards/margins": 0.017805397510528564, "rewards/rejected": 0.8864747881889343, "step": 5602 }, { "epoch": 3.02, "learning_rate": 1.4873157934223634e-08, "logits/chosen": -2.0577192306518555, "logits/rejected": -2.2999391555786133, "logps/chosen": -0.2778891324996948, "logps/rejected": -0.3087116777896881, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.9422252774238586, "rewards/margins": 0.02404564619064331, "rewards/rejected": 0.9181796312332153, "step": 5603 }, { "epoch": 3.02, "learning_rate": 1.4857620438637803e-08, "logits/chosen": -2.0833418369293213, "logits/rejected": -2.2942774295806885, "logps/chosen": -0.381763756275177, "logps/rejected": -0.4206666052341461, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 1.0386626720428467, "rewards/margins": 0.01510310173034668, "rewards/rejected": 1.0235595703125, "step": 5604 }, { "epoch": 3.02, "learning_rate": 1.4842089646693534e-08, "logits/chosen": -2.0739591121673584, "logits/rejected": -2.075244426727295, "logps/chosen": -0.4106389880180359, "logps/rejected": -5.943086624145508, "loss": 0.4322, "rewards/accuracies": 1.0, "rewards/chosen": 0.9757362604141235, "rewards/margins": 0.614970326423645, "rewards/rejected": 0.3607659339904785, "step": 5605 }, { "epoch": 3.02, "learning_rate": 1.4826565561353389e-08, "logits/chosen": -2.075216770172119, "logits/rejected": -2.3382062911987305, "logps/chosen": -0.3982958197593689, "logps/rejected": -4.561446189880371, "loss": 0.5819, "rewards/accuracies": 1.0, "rewards/chosen": 1.00088369846344, "rewards/margins": 0.23653805255889893, "rewards/rejected": 0.764345645904541, "step": 5606 }, { "epoch": 3.02, "learning_rate": 1.4811048185578695e-08, "logits/chosen": -1.9817991256713867, "logits/rejected": -2.277149200439453, "logps/chosen": -0.326480895280838, "logps/rejected": -0.3338863253593445, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.8891674280166626, "rewards/margins": 0.024758636951446533, "rewards/rejected": 0.8644087910652161, "step": 5607 }, { "epoch": 3.02, "learning_rate": 1.47955375223295e-08, "logits/chosen": -2.168642282485962, "logits/rejected": -2.160459518432617, "logps/chosen": -4.4522385597229, "logps/rejected": -3.8113396167755127, "loss": 0.6495, "rewards/accuracies": 1.0, "rewards/chosen": 0.7821983098983765, "rewards/margins": 0.08922404050827026, "rewards/rejected": 0.6929742693901062, "step": 5608 }, { "epoch": 3.03, "learning_rate": 1.4780033574564565e-08, "logits/chosen": -2.068392038345337, "logits/rejected": -2.2780282497406006, "logps/chosen": -1.6543989181518555, "logps/rejected": -1.8413381576538086, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 1.0725667476654053, "rewards/margins": -0.007009744644165039, "rewards/rejected": 1.0795764923095703, "step": 5609 }, { "epoch": 3.03, "learning_rate": 1.4764536345241374e-08, "logits/chosen": -2.1745529174804688, "logits/rejected": -2.1794252395629883, "logps/chosen": -0.27182871103286743, "logps/rejected": -7.817731857299805, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 1.00962233543396, "rewards/margins": 0.6406960487365723, "rewards/rejected": 0.3689262568950653, "step": 5610 }, { "epoch": 3.03, "learning_rate": 1.474904583731612e-08, "logits/chosen": -2.072176694869995, "logits/rejected": -2.2895474433898926, "logps/chosen": -0.6585174202919006, "logps/rejected": -0.626919150352478, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 1.004973292350769, "rewards/margins": -0.016290783882141113, "rewards/rejected": 1.0212640762329102, "step": 5611 }, { "epoch": 3.03, "learning_rate": 1.4733562053743725e-08, "logits/chosen": -2.221064805984497, "logits/rejected": -2.0792863368988037, "logps/chosen": -40.0878791809082, "logps/rejected": -3.1280763149261475, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": 3.0599286556243896, "rewards/margins": 2.3528144359588623, "rewards/rejected": 0.7071142196655273, "step": 5612 }, { "epoch": 3.03, "learning_rate": 1.471808499747782e-08, "logits/chosen": -2.008488416671753, "logits/rejected": -2.227757692337036, "logps/chosen": -0.7630698084831238, "logps/rejected": -0.7312989234924316, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 1.0395731925964355, "rewards/margins": 0.03250455856323242, "rewards/rejected": 1.0070686340332031, "step": 5613 }, { "epoch": 3.03, "learning_rate": 1.4702614671470753e-08, "logits/chosen": -2.1219358444213867, "logits/rejected": -2.3129663467407227, "logps/chosen": -2.078700542449951, "logps/rejected": -0.9562610387802124, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 1.0200777053833008, "rewards/margins": 0.017661094665527344, "rewards/rejected": 1.0024166107177734, "step": 5614 }, { "epoch": 3.03, "learning_rate": 1.4687151078673583e-08, "logits/chosen": -2.297968864440918, "logits/rejected": -2.2056381702423096, "logps/chosen": -37.412784576416016, "logps/rejected": -7.830273151397705, "loss": 0.133, "rewards/accuracies": 1.0, "rewards/chosen": 2.5337016582489014, "rewards/margins": 1.9504716396331787, "rewards/rejected": 0.5832299590110779, "step": 5615 }, { "epoch": 3.03, "learning_rate": 1.4671694222036119e-08, "logits/chosen": -2.0633232593536377, "logits/rejected": -2.3021137714385986, "logps/chosen": -0.910484790802002, "logps/rejected": -0.8331328630447388, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.8948421478271484, "rewards/margins": 0.003039419651031494, "rewards/rejected": 0.8918027281761169, "step": 5616 }, { "epoch": 3.03, "learning_rate": 1.4656244104506843e-08, "logits/chosen": -2.075800657272339, "logits/rejected": -2.11514949798584, "logps/chosen": -2.589430570602417, "logps/rejected": -9.491832733154297, "loss": 0.23, "rewards/accuracies": 1.0, "rewards/chosen": 1.8258142471313477, "rewards/margins": 1.3523399829864502, "rewards/rejected": 0.47347432374954224, "step": 5617 }, { "epoch": 3.03, "learning_rate": 1.464080072903297e-08, "logits/chosen": -2.047919273376465, "logits/rejected": -2.3246870040893555, "logps/chosen": -2.418367385864258, "logps/rejected": -3.607534170150757, "loss": 0.7128, "rewards/accuracies": 0.0, "rewards/chosen": 1.022137999534607, "rewards/margins": -0.038999199867248535, "rewards/rejected": 1.0611371994018555, "step": 5618 }, { "epoch": 3.03, "learning_rate": 1.4625364098560428e-08, "logits/chosen": -2.1049792766571045, "logits/rejected": -2.1077120304107666, "logps/chosen": -1.5785871744155884, "logps/rejected": -6.030384063720703, "loss": 0.2799, "rewards/accuracies": 1.0, "rewards/chosen": 1.6141777038574219, "rewards/margins": 1.1299234628677368, "rewards/rejected": 0.48425427079200745, "step": 5619 }, { "epoch": 3.03, "learning_rate": 1.4609934216033848e-08, "logits/chosen": -1.9562188386917114, "logits/rejected": -2.306217908859253, "logps/chosen": -2.3574888706207275, "logps/rejected": -0.8107069730758667, "loss": 0.6548, "rewards/accuracies": 1.0, "rewards/chosen": 0.9504607319831848, "rewards/margins": 0.07817822694778442, "rewards/rejected": 0.8722825050354004, "step": 5620 }, { "epoch": 3.03, "learning_rate": 1.4594511084396588e-08, "logits/chosen": -2.198070764541626, "logits/rejected": -2.320115327835083, "logps/chosen": -3.7238802909851074, "logps/rejected": -0.3752708435058594, "loss": 0.7249, "rewards/accuracies": 0.0, "rewards/chosen": 0.9806238412857056, "rewards/margins": -0.06243479251861572, "rewards/rejected": 1.0430586338043213, "step": 5621 }, { "epoch": 3.03, "learning_rate": 1.4579094706590717e-08, "logits/chosen": -2.3033313751220703, "logits/rejected": -2.332573413848877, "logps/chosen": -0.4487892985343933, "logps/rejected": -0.4973297417163849, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.9490726590156555, "rewards/margins": 0.0052315592765808105, "rewards/rejected": 0.9438410997390747, "step": 5622 }, { "epoch": 3.03, "learning_rate": 1.4563685085557014e-08, "logits/chosen": -2.0798826217651367, "logits/rejected": -2.0903117656707764, "logps/chosen": -1.6780576705932617, "logps/rejected": -2.32366943359375, "loss": 0.4746, "rewards/accuracies": 1.0, "rewards/chosen": 1.1810145378112793, "rewards/margins": 0.4986485242843628, "rewards/rejected": 0.6823660135269165, "step": 5623 }, { "epoch": 3.03, "learning_rate": 1.4548282224234947e-08, "logits/chosen": -2.066157102584839, "logits/rejected": -2.3485989570617676, "logps/chosen": -0.4704514443874359, "logps/rejected": -0.5572919249534607, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 1.0634080171585083, "rewards/margins": 0.021898508071899414, "rewards/rejected": 1.0415095090866089, "step": 5624 }, { "epoch": 3.03, "learning_rate": 1.4532886125562727e-08, "logits/chosen": -1.9927661418914795, "logits/rejected": -2.2868967056274414, "logps/chosen": -1.3298717737197876, "logps/rejected": -1.16985285282135, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.8248905539512634, "rewards/margins": 0.013599395751953125, "rewards/rejected": 0.8112911581993103, "step": 5625 }, { "epoch": 3.03, "learning_rate": 1.4517496792477257e-08, "logits/chosen": -2.0630970001220703, "logits/rejected": -2.1711928844451904, "logps/chosen": -2.267214775085449, "logps/rejected": -10.689254760742188, "loss": 0.4521, "rewards/accuracies": 1.0, "rewards/chosen": 1.341729998588562, "rewards/margins": 0.5592739582061768, "rewards/rejected": 0.7824560403823853, "step": 5626 }, { "epoch": 3.04, "learning_rate": 1.4502114227914136e-08, "logits/chosen": -2.1728012561798096, "logits/rejected": -2.344362497329712, "logps/chosen": -1.031346082687378, "logps/rejected": -0.9990260601043701, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 1.056018590927124, "rewards/margins": 0.0128248929977417, "rewards/rejected": 1.0431936979293823, "step": 5627 }, { "epoch": 3.04, "learning_rate": 1.4486738434807732e-08, "logits/chosen": -2.038015127182007, "logits/rejected": -2.0364251136779785, "logps/chosen": -2.0502593517303467, "logps/rejected": -6.3313493728637695, "loss": 0.4074, "rewards/accuracies": 1.0, "rewards/chosen": 1.1493228673934937, "rewards/margins": 0.6874071359634399, "rewards/rejected": 0.4619157016277313, "step": 5628 }, { "epoch": 3.04, "learning_rate": 1.4471369416091061e-08, "logits/chosen": -2.132490873336792, "logits/rejected": -2.164807081222534, "logps/chosen": -2.1830880641937256, "logps/rejected": -23.35770606994629, "loss": 0.5718, "rewards/accuracies": 1.0, "rewards/chosen": 1.1430542469024658, "rewards/margins": 0.2595754861831665, "rewards/rejected": 0.8834787607192993, "step": 5629 }, { "epoch": 3.04, "learning_rate": 1.4456007174695856e-08, "logits/chosen": -2.074962854385376, "logits/rejected": -2.2592527866363525, "logps/chosen": -1.5397177934646606, "logps/rejected": -1.5312954187393188, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.7430691123008728, "rewards/margins": 0.02988600730895996, "rewards/rejected": 0.7131831049919128, "step": 5630 }, { "epoch": 3.04, "learning_rate": 1.4440651713552577e-08, "logits/chosen": -1.9911223649978638, "logits/rejected": -2.242966651916504, "logps/chosen": -1.0405229330062866, "logps/rejected": -1.0327128171920776, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 1.1187995672225952, "rewards/margins": 0.051694631576538086, "rewards/rejected": 1.0671049356460571, "step": 5631 }, { "epoch": 3.04, "learning_rate": 1.442530303559037e-08, "logits/chosen": -2.0655744075775146, "logits/rejected": -2.0551598072052, "logps/chosen": -4.403059005737305, "logps/rejected": -2.223480463027954, "loss": 0.3721, "rewards/accuracies": 1.0, "rewards/chosen": 1.7129663228988647, "rewards/margins": 0.7968852519989014, "rewards/rejected": 0.9160810708999634, "step": 5632 }, { "epoch": 3.04, "learning_rate": 1.440996114373711e-08, "logits/chosen": -2.011608362197876, "logits/rejected": -2.014885425567627, "logps/chosen": -3.3438751697540283, "logps/rejected": -1.4862648248672485, "loss": 0.6292, "rewards/accuracies": 1.0, "rewards/chosen": 1.334587812423706, "rewards/margins": 0.1323603391647339, "rewards/rejected": 1.2022274732589722, "step": 5633 }, { "epoch": 3.04, "learning_rate": 1.439462604091935e-08, "logits/chosen": -2.139634847640991, "logits/rejected": -2.1268818378448486, "logps/chosen": -30.61149787902832, "logps/rejected": -27.733646392822266, "loss": 0.1971, "rewards/accuracies": 1.0, "rewards/chosen": 2.2129721641540527, "rewards/margins": 1.5237054824829102, "rewards/rejected": 0.6892666220664978, "step": 5634 }, { "epoch": 3.04, "learning_rate": 1.4379297730062367e-08, "logits/chosen": -2.0568735599517822, "logits/rejected": -2.14245343208313, "logps/chosen": -1.6870768070220947, "logps/rejected": -10.465807914733887, "loss": 0.3265, "rewards/accuracies": 1.0, "rewards/chosen": 1.5644311904907227, "rewards/margins": 0.9515784382820129, "rewards/rejected": 0.6128527522087097, "step": 5635 }, { "epoch": 3.04, "learning_rate": 1.4363976214090146e-08, "logits/chosen": -2.0396616458892822, "logits/rejected": -2.0236568450927734, "logps/chosen": -6.843955039978027, "logps/rejected": -5.0404133796691895, "loss": 0.3356, "rewards/accuracies": 1.0, "rewards/chosen": 1.3846173286437988, "rewards/margins": 0.9193486571311951, "rewards/rejected": 0.46526867151260376, "step": 5636 }, { "epoch": 3.04, "learning_rate": 1.4348661495925358e-08, "logits/chosen": -1.9616093635559082, "logits/rejected": -2.2232611179351807, "logps/chosen": -0.14172682166099548, "logps/rejected": -0.20621341466903687, "loss": 0.7053, "rewards/accuracies": 0.0, "rewards/chosen": 0.9340991377830505, "rewards/margins": -0.024206995964050293, "rewards/rejected": 0.9583061337471008, "step": 5637 }, { "epoch": 3.04, "learning_rate": 1.4333353578489393e-08, "logits/chosen": -2.0273025035858154, "logits/rejected": -2.247568130493164, "logps/chosen": -9.479251861572266, "logps/rejected": -11.500869750976562, "loss": 0.6693, "rewards/accuracies": 1.0, "rewards/chosen": 1.1803292036056519, "rewards/margins": 0.04833722114562988, "rewards/rejected": 1.131991982460022, "step": 5638 }, { "epoch": 3.04, "learning_rate": 1.4318052464702335e-08, "logits/chosen": -2.2153310775756836, "logits/rejected": -2.034942388534546, "logps/chosen": -22.04970359802246, "logps/rejected": -4.472585201263428, "loss": 0.1543, "rewards/accuracies": 1.0, "rewards/chosen": 2.350048780441284, "rewards/margins": 1.7908966541290283, "rewards/rejected": 0.5591520667076111, "step": 5639 }, { "epoch": 3.04, "learning_rate": 1.430275815748297e-08, "logits/chosen": -2.0236315727233887, "logits/rejected": -2.3027255535125732, "logps/chosen": -1.5162925720214844, "logps/rejected": -0.7516605854034424, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 1.0764920711517334, "rewards/margins": 0.028094172477722168, "rewards/rejected": 1.0483978986740112, "step": 5640 }, { "epoch": 3.04, "learning_rate": 1.4287470659748791e-08, "logits/chosen": -2.0101332664489746, "logits/rejected": -2.011554718017578, "logps/chosen": -6.075606822967529, "logps/rejected": -1.4828935861587524, "loss": 0.4181, "rewards/accuracies": 1.0, "rewards/chosen": 1.2333710193634033, "rewards/margins": 0.6558189988136292, "rewards/rejected": 0.5775520205497742, "step": 5641 }, { "epoch": 3.04, "learning_rate": 1.4272189974415988e-08, "logits/chosen": -2.059981107711792, "logits/rejected": -2.0731070041656494, "logps/chosen": -1.2380520105361938, "logps/rejected": -7.0959882736206055, "loss": 0.4289, "rewards/accuracies": 1.0, "rewards/chosen": 1.2919807434082031, "rewards/margins": 0.6243720650672913, "rewards/rejected": 0.6676086783409119, "step": 5642 }, { "epoch": 3.04, "learning_rate": 1.4256916104399457e-08, "logits/chosen": -2.1403443813323975, "logits/rejected": -2.144345998764038, "logps/chosen": -1.900067687034607, "logps/rejected": -5.865045070648193, "loss": 0.3665, "rewards/accuracies": 1.0, "rewards/chosen": 1.1784446239471436, "rewards/margins": 0.8150603771209717, "rewards/rejected": 0.3633842170238495, "step": 5643 }, { "epoch": 3.04, "learning_rate": 1.4241649052612786e-08, "logits/chosen": -2.0067036151885986, "logits/rejected": -1.9945685863494873, "logps/chosen": -0.717653214931488, "logps/rejected": -3.66340708732605, "loss": 0.52, "rewards/accuracies": 1.0, "rewards/chosen": 1.037042498588562, "rewards/margins": 0.38264280557632446, "rewards/rejected": 0.6543996930122375, "step": 5644 }, { "epoch": 3.04, "learning_rate": 1.4226388821968265e-08, "logits/chosen": -2.082219123840332, "logits/rejected": -2.0828840732574463, "logps/chosen": -1.1387691497802734, "logps/rejected": -1.7798631191253662, "loss": 0.5178, "rewards/accuracies": 1.0, "rewards/chosen": 1.1247241497039795, "rewards/margins": 0.3882293105125427, "rewards/rejected": 0.7364948391914368, "step": 5645 }, { "epoch": 3.05, "learning_rate": 1.421113541537689e-08, "logits/chosen": -2.203977108001709, "logits/rejected": -2.360128164291382, "logps/chosen": -15.599595069885254, "logps/rejected": -3.821828842163086, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.9638434648513794, "rewards/margins": 0.021880805492401123, "rewards/rejected": 0.9419626593589783, "step": 5646 }, { "epoch": 3.05, "learning_rate": 1.4195888835748343e-08, "logits/chosen": -2.075286626815796, "logits/rejected": -2.1413493156433105, "logps/chosen": -10.282928466796875, "logps/rejected": -6.4173994064331055, "loss": 0.6695, "rewards/accuracies": 1.0, "rewards/chosen": 1.1758625507354736, "rewards/margins": 0.04794192314147949, "rewards/rejected": 1.1279206275939941, "step": 5647 }, { "epoch": 3.05, "learning_rate": 1.4180649085991015e-08, "logits/chosen": -1.9658219814300537, "logits/rejected": -1.933949589729309, "logps/chosen": -9.3844633102417, "logps/rejected": -5.863330841064453, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 1.0650439262390137, "rewards/margins": 0.7326632738113403, "rewards/rejected": 0.3323806822299957, "step": 5648 }, { "epoch": 3.05, "learning_rate": 1.4165416169011968e-08, "logits/chosen": -2.0927278995513916, "logits/rejected": -2.0925536155700684, "logps/chosen": -3.003619432449341, "logps/rejected": -3.7361409664154053, "loss": 0.5727, "rewards/accuracies": 1.0, "rewards/chosen": 0.8340012431144714, "rewards/margins": 0.2575022578239441, "rewards/rejected": 0.5764989852905273, "step": 5649 }, { "epoch": 3.05, "learning_rate": 1.4150190087717018e-08, "logits/chosen": -2.009571075439453, "logits/rejected": -2.024134874343872, "logps/chosen": -1.4362082481384277, "logps/rejected": -8.397714614868164, "loss": 0.4633, "rewards/accuracies": 1.0, "rewards/chosen": 1.1664742231369019, "rewards/margins": 0.5288617014884949, "rewards/rejected": 0.637612521648407, "step": 5650 }, { "epoch": 3.05, "learning_rate": 1.4134970845010624e-08, "logits/chosen": -1.9827289581298828, "logits/rejected": -1.9829070568084717, "logps/chosen": -0.796502947807312, "logps/rejected": -2.166091203689575, "loss": 0.5349, "rewards/accuracies": 1.0, "rewards/chosen": 1.0502699613571167, "rewards/margins": 0.34623926877975464, "rewards/rejected": 0.7040306925773621, "step": 5651 }, { "epoch": 3.05, "learning_rate": 1.411975844379597e-08, "logits/chosen": -2.052387237548828, "logits/rejected": -2.1015520095825195, "logps/chosen": -6.436368942260742, "logps/rejected": -17.805036544799805, "loss": 0.5426, "rewards/accuracies": 1.0, "rewards/chosen": 1.053078532218933, "rewards/margins": 0.32791638374328613, "rewards/rejected": 0.725162148475647, "step": 5652 }, { "epoch": 3.05, "learning_rate": 1.410455288697489e-08, "logits/chosen": -2.1041481494903564, "logits/rejected": -2.284554958343506, "logps/chosen": -0.895893931388855, "logps/rejected": -1.0032117366790771, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 0.8975426554679871, "rewards/margins": 0.0378686785697937, "rewards/rejected": 0.8596739768981934, "step": 5653 }, { "epoch": 3.05, "learning_rate": 1.4089354177447971e-08, "logits/chosen": -2.0651655197143555, "logits/rejected": -2.058997631072998, "logps/chosen": -5.550833702087402, "logps/rejected": -6.209212303161621, "loss": 0.3699, "rewards/accuracies": 1.0, "rewards/chosen": 1.4217603206634521, "rewards/margins": 0.8039489984512329, "rewards/rejected": 0.6178113222122192, "step": 5654 }, { "epoch": 3.05, "learning_rate": 1.4074162318114458e-08, "logits/chosen": -2.1655993461608887, "logits/rejected": -2.256943941116333, "logps/chosen": -1.9026232957839966, "logps/rejected": -0.605130672454834, "loss": 0.6439, "rewards/accuracies": 1.0, "rewards/chosen": 0.9926873445510864, "rewards/margins": 0.10108006000518799, "rewards/rejected": 0.8916072845458984, "step": 5655 }, { "epoch": 3.05, "learning_rate": 1.4058977311872295e-08, "logits/chosen": -2.037061929702759, "logits/rejected": -2.0415825843811035, "logps/chosen": -0.904376745223999, "logps/rejected": -3.378958225250244, "loss": 0.4553, "rewards/accuracies": 1.0, "rewards/chosen": 1.1311976909637451, "rewards/margins": 0.550463080406189, "rewards/rejected": 0.5807346105575562, "step": 5656 }, { "epoch": 3.05, "learning_rate": 1.404379916161813e-08, "logits/chosen": -2.0665924549102783, "logits/rejected": -2.011131763458252, "logps/chosen": -7.594085693359375, "logps/rejected": -8.302762031555176, "loss": 0.3792, "rewards/accuracies": 1.0, "rewards/chosen": 1.5323219299316406, "rewards/margins": 0.7741380333900452, "rewards/rejected": 0.7581838965415955, "step": 5657 }, { "epoch": 3.05, "learning_rate": 1.4028627870247294e-08, "logits/chosen": -2.120643138885498, "logits/rejected": -2.129237413406372, "logps/chosen": -1.4994827508926392, "logps/rejected": -4.348135948181152, "loss": 0.4151, "rewards/accuracies": 1.0, "rewards/chosen": 1.3607107400894165, "rewards/margins": 0.6644033789634705, "rewards/rejected": 0.696307361125946, "step": 5658 }, { "epoch": 3.05, "learning_rate": 1.4013463440653811e-08, "logits/chosen": -2.192324638366699, "logits/rejected": -2.0550174713134766, "logps/chosen": -29.146953582763672, "logps/rejected": -18.406688690185547, "loss": 0.249, "rewards/accuracies": 1.0, "rewards/chosen": 2.0095226764678955, "rewards/margins": 1.2631642818450928, "rewards/rejected": 0.746358335018158, "step": 5659 }, { "epoch": 3.05, "learning_rate": 1.3998305875730382e-08, "logits/chosen": -2.027313470840454, "logits/rejected": -2.401785373687744, "logps/chosen": -6.344474792480469, "logps/rejected": -12.486202239990234, "loss": 0.9342, "rewards/accuracies": 0.0, "rewards/chosen": 1.0135868787765503, "rewards/margins": -0.4352083206176758, "rewards/rejected": 1.448795199394226, "step": 5660 }, { "epoch": 3.05, "learning_rate": 1.3983155178368445e-08, "logits/chosen": -2.061115264892578, "logits/rejected": -2.0697882175445557, "logps/chosen": -2.7616326808929443, "logps/rejected": -10.993385314941406, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 1.2943247556686401, "rewards/margins": 0.9167048931121826, "rewards/rejected": 0.37761983275413513, "step": 5661 }, { "epoch": 3.05, "learning_rate": 1.3968011351458075e-08, "logits/chosen": -2.114480972290039, "logits/rejected": -2.121072292327881, "logps/chosen": -1.9824087619781494, "logps/rejected": -3.6919596195220947, "loss": 0.4693, "rewards/accuracies": 1.0, "rewards/chosen": 1.1405712366104126, "rewards/margins": 0.5127536058425903, "rewards/rejected": 0.6278176307678223, "step": 5662 }, { "epoch": 3.05, "learning_rate": 1.395287439788806e-08, "logits/chosen": -2.2615246772766113, "logits/rejected": -2.2515411376953125, "logps/chosen": -0.9996020793914795, "logps/rejected": -4.911154747009277, "loss": 0.4281, "rewards/accuracies": 1.0, "rewards/chosen": 1.004211664199829, "rewards/margins": 0.626865804195404, "rewards/rejected": 0.37734586000442505, "step": 5663 }, { "epoch": 3.06, "learning_rate": 1.393774432054587e-08, "logits/chosen": -2.044218063354492, "logits/rejected": -2.311053514480591, "logps/chosen": -2.131294012069702, "logps/rejected": -1.8459434509277344, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.7356509566307068, "rewards/margins": -0.003368854522705078, "rewards/rejected": 0.7390198111534119, "step": 5664 }, { "epoch": 3.06, "learning_rate": 1.3922621122317679e-08, "logits/chosen": -2.0977370738983154, "logits/rejected": -2.321643352508545, "logps/chosen": -7.458809852600098, "logps/rejected": -5.30801248550415, "loss": 0.8798, "rewards/accuracies": 0.0, "rewards/chosen": 0.38037750124931335, "rewards/margins": -0.34387364983558655, "rewards/rejected": 0.7242511510848999, "step": 5665 }, { "epoch": 3.06, "learning_rate": 1.3907504806088321e-08, "logits/chosen": -1.9836499691009521, "logits/rejected": -2.2695744037628174, "logps/chosen": -7.861726760864258, "logps/rejected": -8.391890525817871, "loss": 0.6611, "rewards/accuracies": 1.0, "rewards/chosen": 0.5657009482383728, "rewards/margins": 0.0650641918182373, "rewards/rejected": 0.5006367564201355, "step": 5666 }, { "epoch": 3.06, "learning_rate": 1.389239537474135e-08, "logits/chosen": -2.1553969383239746, "logits/rejected": -2.156404972076416, "logps/chosen": -1.1836793422698975, "logps/rejected": -12.79594612121582, "loss": 0.3152, "rewards/accuracies": 1.0, "rewards/chosen": 1.1922415494918823, "rewards/margins": 0.9928379654884338, "rewards/rejected": 0.1994035691022873, "step": 5667 }, { "epoch": 3.06, "learning_rate": 1.3877292831158982e-08, "logits/chosen": -2.1352813243865967, "logits/rejected": -2.1351661682128906, "logps/chosen": -0.7196446657180786, "logps/rejected": -1.4897207021713257, "loss": 0.5437, "rewards/accuracies": 1.0, "rewards/chosen": 1.0791634321212769, "rewards/margins": 0.3251425623893738, "rewards/rejected": 0.7540208697319031, "step": 5668 }, { "epoch": 3.06, "learning_rate": 1.386219717822214e-08, "logits/chosen": -2.129784107208252, "logits/rejected": -2.322007656097412, "logps/chosen": -3.9908676147460938, "logps/rejected": -4.021641731262207, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.502795398235321, "rewards/margins": 0.018188178539276123, "rewards/rejected": 0.4846072196960449, "step": 5669 }, { "epoch": 3.06, "learning_rate": 1.3847108418810388e-08, "logits/chosen": -2.130811929702759, "logits/rejected": -2.132950782775879, "logps/chosen": -1.7419945001602173, "logps/rejected": -1.6878918409347534, "loss": 0.5599, "rewards/accuracies": 1.0, "rewards/chosen": 1.0157874822616577, "rewards/margins": 0.28711819648742676, "rewards/rejected": 0.728669285774231, "step": 5670 }, { "epoch": 3.06, "learning_rate": 1.3832026555802029e-08, "logits/chosen": -2.1752865314483643, "logits/rejected": -2.3534719944000244, "logps/chosen": -1.6009838581085205, "logps/rejected": -1.587097406387329, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 1.0301705598831177, "rewards/margins": 0.021655559539794922, "rewards/rejected": 1.0085150003433228, "step": 5671 }, { "epoch": 3.06, "learning_rate": 1.3816951592074012e-08, "logits/chosen": -2.1315245628356934, "logits/rejected": -2.2864747047424316, "logps/chosen": -0.32030826807022095, "logps/rejected": -0.3023597002029419, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 1.0185264348983765, "rewards/margins": 0.029392480850219727, "rewards/rejected": 0.9891339540481567, "step": 5672 }, { "epoch": 3.06, "learning_rate": 1.3801883530502011e-08, "logits/chosen": -2.123204231262207, "logits/rejected": -2.130899429321289, "logps/chosen": -3.9038565158843994, "logps/rejected": -4.565234184265137, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 1.2036365270614624, "rewards/margins": 0.6748769283294678, "rewards/rejected": 0.5287595987319946, "step": 5673 }, { "epoch": 3.06, "learning_rate": 1.378682237396035e-08, "logits/chosen": -2.117006540298462, "logits/rejected": -2.3089146614074707, "logps/chosen": -10.976469993591309, "logps/rejected": -0.5393003225326538, "loss": 0.8139, "rewards/accuracies": 0.0, "rewards/chosen": 0.6880988478660583, "rewards/margins": -0.22841793298721313, "rewards/rejected": 0.9165167808532715, "step": 5674 }, { "epoch": 3.06, "learning_rate": 1.3771768125322041e-08, "logits/chosen": -2.0085253715515137, "logits/rejected": -2.0185537338256836, "logps/chosen": -0.8599507808685303, "logps/rejected": -5.938680648803711, "loss": 0.3152, "rewards/accuracies": 1.0, "rewards/chosen": 1.5557241439819336, "rewards/margins": 0.9927886724472046, "rewards/rejected": 0.562935471534729, "step": 5675 }, { "epoch": 3.06, "learning_rate": 1.3756720787458781e-08, "logits/chosen": -2.1610283851623535, "logits/rejected": -2.3143646717071533, "logps/chosen": -2.256884813308716, "logps/rejected": -0.42571988701820374, "loss": 0.7062, "rewards/accuracies": 0.0, "rewards/chosen": 0.8919116854667664, "rewards/margins": -0.025921940803527832, "rewards/rejected": 0.9178336262702942, "step": 5676 }, { "epoch": 3.06, "learning_rate": 1.3741680363240948e-08, "logits/chosen": -2.0160961151123047, "logits/rejected": -2.0200841426849365, "logps/chosen": -1.217695713043213, "logps/rejected": -5.850285530090332, "loss": 0.373, "rewards/accuracies": 1.0, "rewards/chosen": 1.0906089544296265, "rewards/margins": 0.7938586473464966, "rewards/rejected": 0.2967502772808075, "step": 5677 }, { "epoch": 3.06, "learning_rate": 1.3726646855537605e-08, "logits/chosen": -2.0325093269348145, "logits/rejected": -2.256364583969116, "logps/chosen": -0.19542565941810608, "logps/rejected": -0.2012128233909607, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": 1.0305300951004028, "rewards/margins": 0.046003520488739014, "rewards/rejected": 0.9845265746116638, "step": 5678 }, { "epoch": 3.06, "learning_rate": 1.371162026721649e-08, "logits/chosen": -2.09041166305542, "logits/rejected": -2.065822124481201, "logps/chosen": -5.263885498046875, "logps/rejected": -3.698930501937866, "loss": 0.3204, "rewards/accuracies": 1.0, "rewards/chosen": 1.5886558294296265, "rewards/margins": 0.973662257194519, "rewards/rejected": 0.6149935722351074, "step": 5679 }, { "epoch": 3.06, "learning_rate": 1.3696600601144032e-08, "logits/chosen": -2.174503803253174, "logits/rejected": -2.178893566131592, "logps/chosen": -4.057248592376709, "logps/rejected": -5.4752936363220215, "loss": 0.4118, "rewards/accuracies": 1.0, "rewards/chosen": 1.0701677799224854, "rewards/margins": 0.6743068695068359, "rewards/rejected": 0.3958609104156494, "step": 5680 }, { "epoch": 3.06, "learning_rate": 1.3681587860185323e-08, "logits/chosen": -2.042592763900757, "logits/rejected": -2.046489953994751, "logps/chosen": -3.141122341156006, "logps/rejected": -4.704151630401611, "loss": 0.4695, "rewards/accuracies": 1.0, "rewards/chosen": 1.0552829504013062, "rewards/margins": 0.5122823119163513, "rewards/rejected": 0.5430006384849548, "step": 5681 }, { "epoch": 3.06, "learning_rate": 1.3666582047204145e-08, "logits/chosen": -2.049898386001587, "logits/rejected": -2.0442516803741455, "logps/chosen": -3.2887930870056152, "logps/rejected": -1.3720678091049194, "loss": 0.6324, "rewards/accuracies": 1.0, "rewards/chosen": 0.8845064043998718, "rewards/margins": 0.12542706727981567, "rewards/rejected": 0.7590793371200562, "step": 5682 }, { "epoch": 3.07, "learning_rate": 1.3651583165062958e-08, "logits/chosen": -2.1444666385650635, "logits/rejected": -2.306190252304077, "logps/chosen": -4.442117691040039, "logps/rejected": -10.162153244018555, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 1.0372027158737183, "rewards/margins": 0.015794038772583008, "rewards/rejected": 1.0214086771011353, "step": 5683 }, { "epoch": 3.07, "learning_rate": 1.36365912166229e-08, "logits/chosen": -2.15343976020813, "logits/rejected": -2.3036088943481445, "logps/chosen": -0.5690246224403381, "logps/rejected": -0.7292542457580566, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9742045402526855, "rewards/margins": 0.022479891777038574, "rewards/rejected": 0.951724648475647, "step": 5684 }, { "epoch": 3.07, "learning_rate": 1.362160620474378e-08, "logits/chosen": -2.057997226715088, "logits/rejected": -2.0634372234344482, "logps/chosen": -2.9735774993896484, "logps/rejected": -6.296367168426514, "loss": 0.4009, "rewards/accuracies": 1.0, "rewards/chosen": 1.0563093423843384, "rewards/margins": 0.7070080041885376, "rewards/rejected": 0.3493013083934784, "step": 5685 }, { "epoch": 3.07, "learning_rate": 1.3606628132284086e-08, "logits/chosen": -2.197120428085327, "logits/rejected": -2.40936541557312, "logps/chosen": -1.174742579460144, "logps/rejected": -1.2592777013778687, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.812732994556427, "rewards/margins": -0.0001678466796875, "rewards/rejected": 0.8129008412361145, "step": 5686 }, { "epoch": 3.07, "learning_rate": 1.3591657002100992e-08, "logits/chosen": -2.047830581665039, "logits/rejected": -2.2907915115356445, "logps/chosen": -9.06545639038086, "logps/rejected": -9.548730850219727, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 1.1728438138961792, "rewards/margins": 0.016271591186523438, "rewards/rejected": 1.1565722227096558, "step": 5687 }, { "epoch": 3.07, "learning_rate": 1.357669281705034e-08, "logits/chosen": -1.9959981441497803, "logits/rejected": -1.9993234872817993, "logps/chosen": -0.9276307225227356, "logps/rejected": -1.7679221630096436, "loss": 0.5125, "rewards/accuracies": 1.0, "rewards/chosen": 1.1642073392868042, "rewards/margins": 0.4013877511024475, "rewards/rejected": 0.7628195881843567, "step": 5688 }, { "epoch": 3.07, "learning_rate": 1.3561735579986639e-08, "logits/chosen": -2.1441650390625, "logits/rejected": -2.158370018005371, "logps/chosen": -0.3206389248371124, "logps/rejected": -9.462812423706055, "loss": 0.4472, "rewards/accuracies": 1.0, "rewards/chosen": 0.9810364842414856, "rewards/margins": 0.5728539228439331, "rewards/rejected": 0.4081825315952301, "step": 5689 }, { "epoch": 3.07, "learning_rate": 1.3546785293763086e-08, "logits/chosen": -2.0565357208251953, "logits/rejected": -2.282757520675659, "logps/chosen": -1.3401415348052979, "logps/rejected": -46.232887268066406, "loss": 0.2482, "rewards/accuracies": 1.0, "rewards/chosen": 0.8737795948982239, "rewards/margins": 1.2670108079910278, "rewards/rejected": -0.39323121309280396, "step": 5690 }, { "epoch": 3.07, "learning_rate": 1.353184196123155e-08, "logits/chosen": -2.126901865005493, "logits/rejected": -2.2564802169799805, "logps/chosen": -1.1015058755874634, "logps/rejected": -0.969636082649231, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 1.0932046175003052, "rewards/margins": 0.011777997016906738, "rewards/rejected": 1.0814266204833984, "step": 5691 }, { "epoch": 3.07, "learning_rate": 1.3516905585242566e-08, "logits/chosen": -2.0816712379455566, "logits/rejected": -2.0846071243286133, "logps/chosen": -0.37976956367492676, "logps/rejected": -8.677809715270996, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 0.9894084930419922, "rewards/margins": 0.946945309638977, "rewards/rejected": 0.042463209480047226, "step": 5692 }, { "epoch": 3.07, "learning_rate": 1.3501976168645346e-08, "logits/chosen": -2.133180618286133, "logits/rejected": -2.14473819732666, "logps/chosen": -7.14784049987793, "logps/rejected": -1.891919493675232, "loss": 0.5408, "rewards/accuracies": 1.0, "rewards/chosen": 1.1059401035308838, "rewards/margins": 0.3320619463920593, "rewards/rejected": 0.7738781571388245, "step": 5693 }, { "epoch": 3.07, "learning_rate": 1.3487053714287766e-08, "logits/chosen": -1.9746390581130981, "logits/rejected": -1.982572317123413, "logps/chosen": -1.4440850019454956, "logps/rejected": -3.3153269290924072, "loss": 0.4473, "rewards/accuracies": 1.0, "rewards/chosen": 1.141830563545227, "rewards/margins": 0.5726725459098816, "rewards/rejected": 0.5691580176353455, "step": 5694 }, { "epoch": 3.07, "learning_rate": 1.3472138225016404e-08, "logits/chosen": -2.0931475162506104, "logits/rejected": -2.3154895305633545, "logps/chosen": -0.3030487596988678, "logps/rejected": -0.39497968554496765, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.8723239898681641, "rewards/margins": -0.0055024027824401855, "rewards/rejected": 0.8778263926506042, "step": 5695 }, { "epoch": 3.07, "learning_rate": 1.3457229703676482e-08, "logits/chosen": -2.0803329944610596, "logits/rejected": -2.0861146450042725, "logps/chosen": -0.3229998052120209, "logps/rejected": -8.739391326904297, "loss": 0.3475, "rewards/accuracies": 1.0, "rewards/chosen": 1.1901935338974, "rewards/margins": 0.8782593011856079, "rewards/rejected": 0.3119342029094696, "step": 5696 }, { "epoch": 3.07, "learning_rate": 1.3442328153111898e-08, "logits/chosen": -2.1660218238830566, "logits/rejected": -2.3366878032684326, "logps/chosen": -0.9627299904823303, "logps/rejected": -0.9876900911331177, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9735874533653259, "rewards/margins": 0.01437467336654663, "rewards/rejected": 0.9592127799987793, "step": 5697 }, { "epoch": 3.07, "learning_rate": 1.3427433576165215e-08, "logits/chosen": -2.0949547290802, "logits/rejected": -2.391408681869507, "logps/chosen": -1.816267490386963, "logps/rejected": -1.6236634254455566, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 1.115197777748108, "rewards/margins": 0.04580247402191162, "rewards/rejected": 1.0693953037261963, "step": 5698 }, { "epoch": 3.07, "learning_rate": 1.3412545975677692e-08, "logits/chosen": -2.1536850929260254, "logits/rejected": -2.137054681777954, "logps/chosen": -7.784499168395996, "logps/rejected": -4.6547698974609375, "loss": 0.295, "rewards/accuracies": 1.0, "rewards/chosen": 1.5660386085510254, "rewards/margins": 1.0694921016693115, "rewards/rejected": 0.49654656648635864, "step": 5699 }, { "epoch": 3.07, "learning_rate": 1.3397665354489207e-08, "logits/chosen": -2.17045259475708, "logits/rejected": -2.2529990673065186, "logps/chosen": -4.154566764831543, "logps/rejected": -15.349759101867676, "loss": 0.4893, "rewards/accuracies": 1.0, "rewards/chosen": 1.23729407787323, "rewards/margins": 0.46008753776550293, "rewards/rejected": 0.777206540107727, "step": 5700 }, { "epoch": 3.07, "learning_rate": 1.3382791715438351e-08, "logits/chosen": -2.141705274581909, "logits/rejected": -2.145885467529297, "logps/chosen": -1.178069829940796, "logps/rejected": -2.182832956314087, "loss": 0.4213, "rewards/accuracies": 1.0, "rewards/chosen": 1.2217772006988525, "rewards/margins": 0.6462326645851135, "rewards/rejected": 0.575544536113739, "step": 5701 }, { "epoch": 3.08, "learning_rate": 1.3367925061362372e-08, "logits/chosen": -2.11441969871521, "logits/rejected": -2.347980499267578, "logps/chosen": -0.46229180693626404, "logps/rejected": -0.5286009907722473, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.9339444041252136, "rewards/margins": 0.0036286115646362305, "rewards/rejected": 0.9303157925605774, "step": 5702 }, { "epoch": 3.08, "learning_rate": 1.335306539509718e-08, "logits/chosen": -2.135399341583252, "logits/rejected": -2.143848419189453, "logps/chosen": -1.5277045965194702, "logps/rejected": -2.564321994781494, "loss": 0.4086, "rewards/accuracies": 1.0, "rewards/chosen": 1.4903662204742432, "rewards/margins": 0.6838230490684509, "rewards/rejected": 0.8065431714057922, "step": 5703 }, { "epoch": 3.08, "learning_rate": 1.3338212719477349e-08, "logits/chosen": -2.155484914779663, "logits/rejected": -2.2832329273223877, "logps/chosen": -5.724812984466553, "logps/rejected": -4.832899570465088, "loss": 0.6614, "rewards/accuracies": 1.0, "rewards/chosen": 1.044194221496582, "rewards/margins": 0.06449335813522339, "rewards/rejected": 0.9797008633613586, "step": 5704 }, { "epoch": 3.08, "learning_rate": 1.332336703733612e-08, "logits/chosen": -2.0719778537750244, "logits/rejected": -2.0615975856781006, "logps/chosen": -3.6991539001464844, "logps/rejected": -7.4983134269714355, "loss": 0.2886, "rewards/accuracies": 1.0, "rewards/chosen": 1.4386167526245117, "rewards/margins": 1.095118522644043, "rewards/rejected": 0.3434982895851135, "step": 5705 }, { "epoch": 3.08, "learning_rate": 1.3308528351505432e-08, "logits/chosen": -2.144855260848999, "logits/rejected": -2.1450133323669434, "logps/chosen": -0.6098855137825012, "logps/rejected": -2.448687791824341, "loss": 0.4759, "rewards/accuracies": 1.0, "rewards/chosen": 1.2194710969924927, "rewards/margins": 0.49504679441452026, "rewards/rejected": 0.7244243025779724, "step": 5706 }, { "epoch": 3.08, "learning_rate": 1.3293696664815852e-08, "logits/chosen": -2.236755847930908, "logits/rejected": -2.0741965770721436, "logps/chosen": -32.941123962402344, "logps/rejected": -4.270382404327393, "loss": 0.0765, "rewards/accuracies": 1.0, "rewards/chosen": 3.0108134746551514, "rewards/margins": 2.531492233276367, "rewards/rejected": 0.479321151971817, "step": 5707 }, { "epoch": 3.08, "learning_rate": 1.3278871980096607e-08, "logits/chosen": -2.021855592727661, "logits/rejected": -2.3034074306488037, "logps/chosen": -0.18863260746002197, "logps/rejected": -0.22218170762062073, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.9741224646568298, "rewards/margins": 0.02059251070022583, "rewards/rejected": 0.953529953956604, "step": 5708 }, { "epoch": 3.08, "learning_rate": 1.326405430017562e-08, "logits/chosen": -2.0968363285064697, "logits/rejected": -2.096328020095825, "logps/chosen": -0.9087291359901428, "logps/rejected": -1.9785423278808594, "loss": 0.6236, "rewards/accuracies": 1.0, "rewards/chosen": 0.8791929483413696, "rewards/margins": 0.14419549703598022, "rewards/rejected": 0.7349974513053894, "step": 5709 }, { "epoch": 3.08, "learning_rate": 1.3249243627879459e-08, "logits/chosen": -2.047208309173584, "logits/rejected": -2.2596776485443115, "logps/chosen": -3.050250291824341, "logps/rejected": -3.0558555126190186, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.505894124507904, "rewards/margins": 0.03456699848175049, "rewards/rejected": 0.47132712602615356, "step": 5710 }, { "epoch": 3.08, "learning_rate": 1.3234439966033344e-08, "logits/chosen": -2.081773519515991, "logits/rejected": -2.322260856628418, "logps/chosen": -5.121401309967041, "logps/rejected": -1.3613299131393433, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0793135166168213, "rewards/margins": 0.011044025421142578, "rewards/rejected": 1.0682694911956787, "step": 5711 }, { "epoch": 3.08, "learning_rate": 1.3219643317461193e-08, "logits/chosen": -2.053345203399658, "logits/rejected": -2.0446722507476807, "logps/chosen": -2.31002140045166, "logps/rejected": -7.59564733505249, "loss": 0.4094, "rewards/accuracies": 1.0, "rewards/chosen": 0.9139540791511536, "rewards/margins": 0.6814520359039307, "rewards/rejected": 0.2325020283460617, "step": 5712 }, { "epoch": 3.08, "learning_rate": 1.3204853684985545e-08, "logits/chosen": -2.0496997833251953, "logits/rejected": -2.052595615386963, "logps/chosen": -0.2076653242111206, "logps/rejected": -4.401008129119873, "loss": 0.447, "rewards/accuracies": 1.0, "rewards/chosen": 0.9149705767631531, "rewards/margins": 0.5733832120895386, "rewards/rejected": 0.3415873944759369, "step": 5713 }, { "epoch": 3.08, "learning_rate": 1.3190071071427634e-08, "logits/chosen": -2.1059248447418213, "logits/rejected": -2.1012349128723145, "logps/chosen": -11.716290473937988, "logps/rejected": -1.0955965518951416, "loss": 0.7662, "rewards/accuracies": 0.0, "rewards/chosen": 0.8467883467674255, "rewards/margins": -0.1410989761352539, "rewards/rejected": 0.9878873229026794, "step": 5714 }, { "epoch": 3.08, "learning_rate": 1.3175295479607335e-08, "logits/chosen": -2.250856399536133, "logits/rejected": -2.31809401512146, "logps/chosen": -8.151671409606934, "logps/rejected": -6.4855828285217285, "loss": 0.6322, "rewards/accuracies": 1.0, "rewards/chosen": 1.0997045040130615, "rewards/margins": 0.12590628862380981, "rewards/rejected": 0.9737982153892517, "step": 5715 }, { "epoch": 3.08, "learning_rate": 1.316052691234319e-08, "logits/chosen": -2.065044403076172, "logits/rejected": -2.0697951316833496, "logps/chosen": -6.820925712585449, "logps/rejected": -8.930782318115234, "loss": 0.2464, "rewards/accuracies": 1.0, "rewards/chosen": 1.8192577362060547, "rewards/margins": 1.2751301527023315, "rewards/rejected": 0.5441275835037231, "step": 5716 }, { "epoch": 3.08, "learning_rate": 1.31457653724524e-08, "logits/chosen": -1.9769963026046753, "logits/rejected": -1.974328875541687, "logps/chosen": -0.9732743501663208, "logps/rejected": -3.696727991104126, "loss": 0.634, "rewards/accuracies": 1.0, "rewards/chosen": 0.9300228953361511, "rewards/margins": 0.12200820446014404, "rewards/rejected": 0.8080146908760071, "step": 5717 }, { "epoch": 3.08, "learning_rate": 1.3131010862750835e-08, "logits/chosen": -2.130890130996704, "logits/rejected": -2.017194986343384, "logps/chosen": -33.53205108642578, "logps/rejected": -4.217336177825928, "loss": 0.1142, "rewards/accuracies": 1.0, "rewards/chosen": 2.7550361156463623, "rewards/margins": 2.1121292114257812, "rewards/rejected": 0.6429068446159363, "step": 5718 }, { "epoch": 3.08, "learning_rate": 1.3116263386053e-08, "logits/chosen": -2.1495702266693115, "logits/rejected": -2.1532866954803467, "logps/chosen": -0.990275502204895, "logps/rejected": -4.4359822273254395, "loss": 0.4378, "rewards/accuracies": 1.0, "rewards/chosen": 1.0150704383850098, "rewards/margins": 0.5992560386657715, "rewards/rejected": 0.4158143699169159, "step": 5719 }, { "epoch": 3.09, "learning_rate": 1.3101522945172089e-08, "logits/chosen": -2.084756374359131, "logits/rejected": -2.081120014190674, "logps/chosen": -5.699283599853516, "logps/rejected": -1.6462258100509644, "loss": 0.3007, "rewards/accuracies": 1.0, "rewards/chosen": 1.8405113220214844, "rewards/margins": 1.047518014907837, "rewards/rejected": 0.7929933667182922, "step": 5720 }, { "epoch": 3.09, "learning_rate": 1.3086789542919934e-08, "logits/chosen": -2.2007486820220947, "logits/rejected": -2.317532539367676, "logps/chosen": -5.359608173370361, "logps/rejected": -30.749109268188477, "loss": 0.3631, "rewards/accuracies": 1.0, "rewards/chosen": 1.467230200767517, "rewards/margins": 0.8259183764457703, "rewards/rejected": 0.6413118243217468, "step": 5721 }, { "epoch": 3.09, "learning_rate": 1.3072063182107024e-08, "logits/chosen": -2.146649122238159, "logits/rejected": -2.3131160736083984, "logps/chosen": -0.5487746000289917, "logps/rejected": -0.4800679683685303, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.9790401458740234, "rewards/margins": 0.02896338701248169, "rewards/rejected": 0.9500767588615417, "step": 5722 }, { "epoch": 3.09, "learning_rate": 1.3057343865542514e-08, "logits/chosen": -2.0957584381103516, "logits/rejected": -2.0779318809509277, "logps/chosen": -6.680477142333984, "logps/rejected": -1.8627474308013916, "loss": 0.3377, "rewards/accuracies": 1.0, "rewards/chosen": 1.725464105606079, "rewards/margins": 0.9121204018592834, "rewards/rejected": 0.8133437037467957, "step": 5723 }, { "epoch": 3.09, "learning_rate": 1.304263159603421e-08, "logits/chosen": -2.0773284435272217, "logits/rejected": -2.07570219039917, "logps/chosen": -0.9814539551734924, "logps/rejected": -6.278600692749023, "loss": 0.3901, "rewards/accuracies": 1.0, "rewards/chosen": 1.132346272468567, "rewards/margins": 0.7401016354560852, "rewards/rejected": 0.3922446370124817, "step": 5724 }, { "epoch": 3.09, "learning_rate": 1.3027926376388577e-08, "logits/chosen": -2.149538278579712, "logits/rejected": -2.1496670246124268, "logps/chosen": -1.0032662153244019, "logps/rejected": -4.96040153503418, "loss": 0.3962, "rewards/accuracies": 1.0, "rewards/chosen": 1.132442831993103, "rewards/margins": 0.721113920211792, "rewards/rejected": 0.41132888197898865, "step": 5725 }, { "epoch": 3.09, "learning_rate": 1.3013228209410732e-08, "logits/chosen": -2.089214324951172, "logits/rejected": -2.0882766246795654, "logps/chosen": -0.7199732661247253, "logps/rejected": -1.8609427213668823, "loss": 0.6542, "rewards/accuracies": 1.0, "rewards/chosen": 0.9317540526390076, "rewards/margins": 0.07937157154083252, "rewards/rejected": 0.852382481098175, "step": 5726 }, { "epoch": 3.09, "learning_rate": 1.2998537097904445e-08, "logits/chosen": -2.1193392276763916, "logits/rejected": -2.2299842834472656, "logps/chosen": -1.9285194873809814, "logps/rejected": -1.7540837526321411, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.7903333902359009, "rewards/margins": 0.015701234340667725, "rewards/rejected": 0.7746321558952332, "step": 5727 }, { "epoch": 3.09, "learning_rate": 1.2983853044672127e-08, "logits/chosen": -2.0433080196380615, "logits/rejected": -2.303851842880249, "logps/chosen": -3.3551619052886963, "logps/rejected": -5.378249168395996, "loss": 0.5767, "rewards/accuracies": 1.0, "rewards/chosen": 0.9668492674827576, "rewards/margins": 0.248196542263031, "rewards/rejected": 0.7186527252197266, "step": 5728 }, { "epoch": 3.09, "learning_rate": 1.2969176052514908e-08, "logits/chosen": -2.09443998336792, "logits/rejected": -2.293686628341675, "logps/chosen": -1.1114271879196167, "logps/rejected": -1.0968343019485474, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 1.080834984779358, "rewards/margins": 0.029779672622680664, "rewards/rejected": 1.0510553121566772, "step": 5729 }, { "epoch": 3.09, "learning_rate": 1.2954506124232473e-08, "logits/chosen": -1.9782360792160034, "logits/rejected": -1.9761184453964233, "logps/chosen": -0.3803319036960602, "logps/rejected": -3.352494955062866, "loss": 0.5588, "rewards/accuracies": 1.0, "rewards/chosen": 0.994408905506134, "rewards/margins": 0.28960084915161133, "rewards/rejected": 0.7048080563545227, "step": 5730 }, { "epoch": 3.09, "learning_rate": 1.2939843262623224e-08, "logits/chosen": -2.1177213191986084, "logits/rejected": -2.3367912769317627, "logps/chosen": -1.3483085632324219, "logps/rejected": -1.255882740020752, "loss": 0.7063, "rewards/accuracies": 0.0, "rewards/chosen": 0.924407958984375, "rewards/margins": -0.026153743267059326, "rewards/rejected": 0.9505617022514343, "step": 5731 }, { "epoch": 3.09, "learning_rate": 1.29251874704842e-08, "logits/chosen": -2.0894298553466797, "logits/rejected": -2.3132948875427246, "logps/chosen": -1.057564616203308, "logps/rejected": -1.0717612504959106, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.7962385416030884, "rewards/margins": 0.003507554531097412, "rewards/rejected": 0.792730987071991, "step": 5732 }, { "epoch": 3.09, "learning_rate": 1.2910538750611095e-08, "logits/chosen": -2.0469858646392822, "logits/rejected": -2.2943711280822754, "logps/chosen": -0.2839146852493286, "logps/rejected": -0.2371293306350708, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.8620050549507141, "rewards/margins": 0.0006077885627746582, "rewards/rejected": 0.8613972663879395, "step": 5733 }, { "epoch": 3.09, "learning_rate": 1.2895897105798237e-08, "logits/chosen": -2.1927521228790283, "logits/rejected": -2.228726625442505, "logps/chosen": -10.2923583984375, "logps/rejected": -12.082064628601074, "loss": 0.392, "rewards/accuracies": 1.0, "rewards/chosen": 1.4774751663208008, "rewards/margins": 0.73405921459198, "rewards/rejected": 0.7434159517288208, "step": 5734 }, { "epoch": 3.09, "learning_rate": 1.2881262538838622e-08, "logits/chosen": -2.1435699462890625, "logits/rejected": -2.140103340148926, "logps/chosen": -5.440501689910889, "logps/rejected": -3.1376898288726807, "loss": 0.3744, "rewards/accuracies": 1.0, "rewards/chosen": 1.4598811864852905, "rewards/margins": 0.7893909215927124, "rewards/rejected": 0.6704902648925781, "step": 5735 }, { "epoch": 3.09, "learning_rate": 1.2866635052523899e-08, "logits/chosen": -2.3026089668273926, "logits/rejected": -2.310868740081787, "logps/chosen": -2.6278812885284424, "logps/rejected": -5.409647464752197, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 1.5327162742614746, "rewards/margins": 0.9921666383743286, "rewards/rejected": 0.540549635887146, "step": 5736 }, { "epoch": 3.09, "learning_rate": 1.2852014649644344e-08, "logits/chosen": -2.1675832271575928, "logits/rejected": -2.167668581008911, "logps/chosen": -2.265626907348633, "logps/rejected": -3.4114747047424316, "loss": 0.4081, "rewards/accuracies": 1.0, "rewards/chosen": 1.493168830871582, "rewards/margins": 0.6854034066200256, "rewards/rejected": 0.8077654242515564, "step": 5737 }, { "epoch": 3.09, "learning_rate": 1.2837401332988906e-08, "logits/chosen": -2.0204973220825195, "logits/rejected": -2.111522674560547, "logps/chosen": -11.387105941772461, "logps/rejected": -16.18772315979004, "loss": 0.4121, "rewards/accuracies": 1.0, "rewards/chosen": 1.7491400241851807, "rewards/margins": 0.6734514236450195, "rewards/rejected": 1.0756886005401611, "step": 5738 }, { "epoch": 3.1, "learning_rate": 1.2822795105345153e-08, "logits/chosen": -2.0085465908050537, "logits/rejected": -2.018181562423706, "logps/chosen": -5.91409969329834, "logps/rejected": -1.4803801774978638, "loss": 0.6479, "rewards/accuracies": 1.0, "rewards/chosen": 1.2417577505111694, "rewards/margins": 0.09266805648803711, "rewards/rejected": 1.1490896940231323, "step": 5739 }, { "epoch": 3.1, "learning_rate": 1.2808195969499352e-08, "logits/chosen": -2.020416259765625, "logits/rejected": -2.289726734161377, "logps/chosen": -0.5013526678085327, "logps/rejected": -0.5494232773780823, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.8529624342918396, "rewards/margins": 0.013222038745880127, "rewards/rejected": 0.8397403955459595, "step": 5740 }, { "epoch": 3.1, "learning_rate": 1.2793603928236369e-08, "logits/chosen": -2.0761427879333496, "logits/rejected": -2.2975995540618896, "logps/chosen": -1.0035252571105957, "logps/rejected": -0.8609964847564697, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 1.1330957412719727, "rewards/margins": 0.013875722885131836, "rewards/rejected": 1.1192200183868408, "step": 5741 }, { "epoch": 3.1, "learning_rate": 1.2779018984339723e-08, "logits/chosen": -1.9857192039489746, "logits/rejected": -1.983585000038147, "logps/chosen": -1.700576663017273, "logps/rejected": -5.261497974395752, "loss": 0.4673, "rewards/accuracies": 1.0, "rewards/chosen": 1.2780808210372925, "rewards/margins": 0.5181056261062622, "rewards/rejected": 0.7599751949310303, "step": 5742 }, { "epoch": 3.1, "learning_rate": 1.2764441140591603e-08, "logits/chosen": -2.1713171005249023, "logits/rejected": -2.2743778228759766, "logps/chosen": -1.092167854309082, "logps/rejected": -1.0483019351959229, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.9311243891716003, "rewards/margins": 0.028441131114959717, "rewards/rejected": 0.9026832580566406, "step": 5743 }, { "epoch": 3.1, "learning_rate": 1.2749870399772822e-08, "logits/chosen": -2.111483335494995, "logits/rejected": -2.1164603233337402, "logps/chosen": -2.0675437450408936, "logps/rejected": -3.5424816608428955, "loss": 0.4652, "rewards/accuracies": 1.0, "rewards/chosen": 1.040213942527771, "rewards/margins": 0.5235674381256104, "rewards/rejected": 0.5166465044021606, "step": 5744 }, { "epoch": 3.1, "learning_rate": 1.2735306764662846e-08, "logits/chosen": -2.0609238147735596, "logits/rejected": -2.0571465492248535, "logps/chosen": -6.195301055908203, "logps/rejected": -5.754384517669678, "loss": 0.2338, "rewards/accuracies": 1.0, "rewards/chosen": 1.6902179718017578, "rewards/margins": 1.334099292755127, "rewards/rejected": 0.35611864924430847, "step": 5745 }, { "epoch": 3.1, "learning_rate": 1.2720750238039801e-08, "logits/chosen": -2.0896644592285156, "logits/rejected": -2.097480058670044, "logps/chosen": -4.928266525268555, "logps/rejected": -3.427128791809082, "loss": 0.4859, "rewards/accuracies": 1.0, "rewards/chosen": 1.0629470348358154, "rewards/margins": 0.4690392017364502, "rewards/rejected": 0.5939078330993652, "step": 5746 }, { "epoch": 3.1, "learning_rate": 1.2706200822680413e-08, "logits/chosen": -2.10495662689209, "logits/rejected": -2.247640371322632, "logps/chosen": -0.2872212529182434, "logps/rejected": -0.2897447347640991, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9666479229927063, "rewards/margins": 0.009391307830810547, "rewards/rejected": 0.9572566151618958, "step": 5747 }, { "epoch": 3.1, "learning_rate": 1.2691658521360082e-08, "logits/chosen": -2.06573224067688, "logits/rejected": -2.0619165897369385, "logps/chosen": -0.5434303283691406, "logps/rejected": -6.247645378112793, "loss": 0.3077, "rewards/accuracies": 1.0, "rewards/chosen": 1.366020917892456, "rewards/margins": 1.0207905769348145, "rewards/rejected": 0.3452303111553192, "step": 5748 }, { "epoch": 3.1, "learning_rate": 1.2677123336852868e-08, "logits/chosen": -2.198045492172241, "logits/rejected": -2.302419424057007, "logps/chosen": -0.20449519157409668, "logps/rejected": -0.21154046058654785, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.8882754445075989, "rewards/margins": 0.0039116740226745605, "rewards/rejected": 0.8843637704849243, "step": 5749 }, { "epoch": 3.1, "learning_rate": 1.266259527193142e-08, "logits/chosen": -2.081644296646118, "logits/rejected": -2.3554272651672363, "logps/chosen": -0.1784319132566452, "logps/rejected": -0.1846083551645279, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.8078330159187317, "rewards/margins": 0.007072150707244873, "rewards/rejected": 0.8007608652114868, "step": 5750 }, { "epoch": 3.1, "learning_rate": 1.2648074329367103e-08, "logits/chosen": -2.1960294246673584, "logits/rejected": -2.1917619705200195, "logps/chosen": -6.5663371086120605, "logps/rejected": -5.234905242919922, "loss": 0.3555, "rewards/accuracies": 1.0, "rewards/chosen": 1.280050277709961, "rewards/margins": 0.8513094782829285, "rewards/rejected": 0.42874079942703247, "step": 5751 }, { "epoch": 3.1, "learning_rate": 1.263356051192987e-08, "logits/chosen": -2.0864226818084717, "logits/rejected": -2.274226427078247, "logps/chosen": -6.6011786460876465, "logps/rejected": -0.6182770133018494, "loss": 0.7224, "rewards/accuracies": 0.0, "rewards/chosen": 0.8320876955986023, "rewards/margins": -0.05772620439529419, "rewards/rejected": 0.8898138999938965, "step": 5752 }, { "epoch": 3.1, "learning_rate": 1.2619053822388321e-08, "logits/chosen": -2.1203606128692627, "logits/rejected": -2.1280629634857178, "logps/chosen": -1.651430368423462, "logps/rejected": -2.87605881690979, "loss": 0.4578, "rewards/accuracies": 1.0, "rewards/chosen": 1.2184146642684937, "rewards/margins": 0.5436861515045166, "rewards/rejected": 0.674728512763977, "step": 5753 }, { "epoch": 3.1, "learning_rate": 1.2604554263509703e-08, "logits/chosen": -2.069880485534668, "logits/rejected": -2.2135636806488037, "logps/chosen": -0.9315598011016846, "logps/rejected": -1.2867512702941895, "loss": 0.7164, "rewards/accuracies": 0.0, "rewards/chosen": 0.8917115330696106, "rewards/margins": -0.045888423919677734, "rewards/rejected": 0.9375999569892883, "step": 5754 }, { "epoch": 3.1, "learning_rate": 1.2590061838059907e-08, "logits/chosen": -2.0612058639526367, "logits/rejected": -2.066289186477661, "logps/chosen": -9.61242961883545, "logps/rejected": -4.321094512939453, "loss": 0.6018, "rewards/accuracies": 1.0, "rewards/chosen": 0.9981858134269714, "rewards/margins": 0.1918211579322815, "rewards/rejected": 0.8063646554946899, "step": 5755 }, { "epoch": 3.1, "learning_rate": 1.2575576548803458e-08, "logits/chosen": -2.1343984603881836, "logits/rejected": -2.1224913597106934, "logps/chosen": -7.350191593170166, "logps/rejected": -4.545601844787598, "loss": 0.3843, "rewards/accuracies": 1.0, "rewards/chosen": 1.208732008934021, "rewards/margins": 0.7580336332321167, "rewards/rejected": 0.4506983757019043, "step": 5756 }, { "epoch": 3.11, "learning_rate": 1.2561098398503517e-08, "logits/chosen": -2.200432777404785, "logits/rejected": -2.112945079803467, "logps/chosen": -31.735225677490234, "logps/rejected": -5.581056118011475, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": 2.1056315898895264, "rewards/margins": 1.770466923713684, "rewards/rejected": 0.3351646959781647, "step": 5757 }, { "epoch": 3.11, "learning_rate": 1.2546627389921893e-08, "logits/chosen": -2.206480026245117, "logits/rejected": -2.191258668899536, "logps/chosen": -2.706071138381958, "logps/rejected": -10.091825485229492, "loss": 0.3431, "rewards/accuracies": 1.0, "rewards/chosen": 1.255714774131775, "rewards/margins": 0.8933504819869995, "rewards/rejected": 0.3623642921447754, "step": 5758 }, { "epoch": 3.11, "learning_rate": 1.2532163525819024e-08, "logits/chosen": -2.0027191638946533, "logits/rejected": -2.3042924404144287, "logps/chosen": -0.26223307847976685, "logps/rejected": -0.32203131914138794, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9748848080635071, "rewards/margins": 0.0057541728019714355, "rewards/rejected": 0.9691306352615356, "step": 5759 }, { "epoch": 3.11, "learning_rate": 1.2517706808953981e-08, "logits/chosen": -2.1296725273132324, "logits/rejected": -2.1400513648986816, "logps/chosen": -1.548933744430542, "logps/rejected": -2.285557508468628, "loss": 0.4821, "rewards/accuracies": 1.0, "rewards/chosen": 1.1971882581710815, "rewards/margins": 0.4788038730621338, "rewards/rejected": 0.7183843851089478, "step": 5760 }, { "epoch": 3.11, "learning_rate": 1.2503257242084492e-08, "logits/chosen": -2.118762493133545, "logits/rejected": -2.11466646194458, "logps/chosen": -3.1001648902893066, "logps/rejected": -2.7225465774536133, "loss": 0.5004, "rewards/accuracies": 1.0, "rewards/chosen": 1.179382562637329, "rewards/margins": 0.4318058490753174, "rewards/rejected": 0.7475767135620117, "step": 5761 }, { "epoch": 3.11, "learning_rate": 1.2488814827966904e-08, "logits/chosen": -2.125593423843384, "logits/rejected": -2.3703298568725586, "logps/chosen": -0.4433137774467468, "logps/rejected": -0.5347281694412231, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.9227399826049805, "rewards/margins": 0.009858965873718262, "rewards/rejected": 0.9128810167312622, "step": 5762 }, { "epoch": 3.11, "learning_rate": 1.2474379569356191e-08, "logits/chosen": -2.1124155521392822, "logits/rejected": -2.346928119659424, "logps/chosen": -5.515757083892822, "logps/rejected": -0.22828984260559082, "loss": 0.5529, "rewards/accuracies": 1.0, "rewards/chosen": 1.298815131187439, "rewards/margins": 0.3033961057662964, "rewards/rejected": 0.9954190254211426, "step": 5763 }, { "epoch": 3.11, "learning_rate": 1.2459951469005986e-08, "logits/chosen": -2.1032187938690186, "logits/rejected": -2.285637140274048, "logps/chosen": -2.5674080848693848, "logps/rejected": -2.5708200931549072, "loss": 0.6716, "rewards/accuracies": 1.0, "rewards/chosen": 0.7080442309379578, "rewards/margins": 0.043491899967193604, "rewards/rejected": 0.6645523309707642, "step": 5764 }, { "epoch": 3.11, "learning_rate": 1.2445530529668547e-08, "logits/chosen": -2.064117193222046, "logits/rejected": -2.3046109676361084, "logps/chosen": -0.2667936384677887, "logps/rejected": -0.30848708748817444, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.9807260632514954, "rewards/margins": -0.0046520233154296875, "rewards/rejected": 0.985378086566925, "step": 5765 }, { "epoch": 3.11, "learning_rate": 1.243111675409475e-08, "logits/chosen": -2.0838210582733154, "logits/rejected": -2.085763454437256, "logps/chosen": -3.366013765335083, "logps/rejected": -3.3231699466705322, "loss": 0.3054, "rewards/accuracies": 1.0, "rewards/chosen": 1.7372850179672241, "rewards/margins": 1.0296485424041748, "rewards/rejected": 0.7076364755630493, "step": 5766 }, { "epoch": 3.11, "learning_rate": 1.2416710145034131e-08, "logits/chosen": -2.1116509437561035, "logits/rejected": -2.1214115619659424, "logps/chosen": -2.6124401092529297, "logps/rejected": -3.2230417728424072, "loss": 0.4554, "rewards/accuracies": 1.0, "rewards/chosen": 1.023047685623169, "rewards/margins": 0.5502791404724121, "rewards/rejected": 0.47276851534843445, "step": 5767 }, { "epoch": 3.11, "learning_rate": 1.2402310705234841e-08, "logits/chosen": -2.029041290283203, "logits/rejected": -2.000133991241455, "logps/chosen": -7.669888496398926, "logps/rejected": -6.040561676025391, "loss": 0.3705, "rewards/accuracies": 1.0, "rewards/chosen": 1.4927769899368286, "rewards/margins": 0.8020092844963074, "rewards/rejected": 0.6907677054405212, "step": 5768 }, { "epoch": 3.11, "learning_rate": 1.2387918437443667e-08, "logits/chosen": -2.0994091033935547, "logits/rejected": -2.2418346405029297, "logps/chosen": -0.315714955329895, "logps/rejected": -0.267616331577301, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.9018601775169373, "rewards/margins": 0.014108777046203613, "rewards/rejected": 0.8877514004707336, "step": 5769 }, { "epoch": 3.11, "learning_rate": 1.2373533344406034e-08, "logits/chosen": -2.0592026710510254, "logits/rejected": -2.3278262615203857, "logps/chosen": -0.2584080398082733, "logps/rejected": -0.25826385617256165, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.8106660842895508, "rewards/margins": 0.02280336618423462, "rewards/rejected": 0.7878627181053162, "step": 5770 }, { "epoch": 3.11, "learning_rate": 1.2359155428865986e-08, "logits/chosen": -1.963868498802185, "logits/rejected": -2.261021137237549, "logps/chosen": -2.549424886703491, "logps/rejected": -3.5636754035949707, "loss": 0.6353, "rewards/accuracies": 1.0, "rewards/chosen": 1.0958682298660278, "rewards/margins": 0.11917906999588013, "rewards/rejected": 0.9766891598701477, "step": 5771 }, { "epoch": 3.11, "learning_rate": 1.2344784693566219e-08, "logits/chosen": -2.0847392082214355, "logits/rejected": -2.26532244682312, "logps/chosen": -0.35654518008232117, "logps/rejected": -0.4324481785297394, "loss": 0.7049, "rewards/accuracies": 0.0, "rewards/chosen": 1.0402034521102905, "rewards/margins": -0.023302435874938965, "rewards/rejected": 1.0635058879852295, "step": 5772 }, { "epoch": 3.11, "learning_rate": 1.2330421141248015e-08, "logits/chosen": -2.163346767425537, "logits/rejected": -2.1769754886627197, "logps/chosen": -3.4323506355285645, "logps/rejected": -2.3161697387695312, "loss": 0.6314, "rewards/accuracies": 1.0, "rewards/chosen": 1.2795127630233765, "rewards/margins": 0.12749063968658447, "rewards/rejected": 1.152022123336792, "step": 5773 }, { "epoch": 3.11, "learning_rate": 1.2316064774651353e-08, "logits/chosen": -2.093830108642578, "logits/rejected": -2.276557683944702, "logps/chosen": -0.6543324589729309, "logps/rejected": -0.6473572254180908, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.8494941592216492, "rewards/margins": 0.008822321891784668, "rewards/rejected": 0.8406718373298645, "step": 5774 }, { "epoch": 3.11, "learning_rate": 1.230171559651481e-08, "logits/chosen": -2.1415610313415527, "logits/rejected": -2.0829832553863525, "logps/chosen": -15.36197280883789, "logps/rejected": -4.171833515167236, "loss": 0.2315, "rewards/accuracies": 1.0, "rewards/chosen": 1.888386607170105, "rewards/margins": 1.345424771308899, "rewards/rejected": 0.542961835861206, "step": 5775 }, { "epoch": 3.12, "learning_rate": 1.2287373609575551e-08, "logits/chosen": -2.19183349609375, "logits/rejected": -2.3574042320251465, "logps/chosen": -0.3301602303981781, "logps/rejected": -0.4091557264328003, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9590086340904236, "rewards/margins": 0.004671692848205566, "rewards/rejected": 0.954336941242218, "step": 5776 }, { "epoch": 3.12, "learning_rate": 1.227303881656942e-08, "logits/chosen": -2.1623024940490723, "logits/rejected": -2.160951614379883, "logps/chosen": -2.656545400619507, "logps/rejected": -3.0358002185821533, "loss": 0.4013, "rewards/accuracies": 1.0, "rewards/chosen": 1.4400551319122314, "rewards/margins": 0.7057667374610901, "rewards/rejected": 0.7342883944511414, "step": 5777 }, { "epoch": 3.12, "learning_rate": 1.2258711220230877e-08, "logits/chosen": -2.0191566944122314, "logits/rejected": -2.0117907524108887, "logps/chosen": -2.7384274005889893, "logps/rejected": -6.658547401428223, "loss": 0.3976, "rewards/accuracies": 1.0, "rewards/chosen": 0.9261876344680786, "rewards/margins": 0.7169864177703857, "rewards/rejected": 0.20920124650001526, "step": 5778 }, { "epoch": 3.12, "learning_rate": 1.2244390823293005e-08, "logits/chosen": -2.0339813232421875, "logits/rejected": -2.2835333347320557, "logps/chosen": -0.563802182674408, "logps/rejected": -0.662015438079834, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 1.0150580406188965, "rewards/margins": 0.008812785148620605, "rewards/rejected": 1.0062452554702759, "step": 5779 }, { "epoch": 3.12, "learning_rate": 1.2230077628487517e-08, "logits/chosen": -2.065589427947998, "logits/rejected": -2.0733602046966553, "logps/chosen": -0.7813801765441895, "logps/rejected": -2.67759370803833, "loss": 0.4615, "rewards/accuracies": 1.0, "rewards/chosen": 1.1084880828857422, "rewards/margins": 0.5337805151939392, "rewards/rejected": 0.574707567691803, "step": 5780 }, { "epoch": 3.12, "learning_rate": 1.2215771638544742e-08, "logits/chosen": -2.045761823654175, "logits/rejected": -2.3269712924957275, "logps/chosen": -0.2045867145061493, "logps/rejected": -0.19795647263526917, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.8887774348258972, "rewards/margins": -0.005392551422119141, "rewards/rejected": 0.8941699862480164, "step": 5781 }, { "epoch": 3.12, "learning_rate": 1.2201472856193646e-08, "logits/chosen": -1.9384450912475586, "logits/rejected": -2.217315196990967, "logps/chosen": -0.31189870834350586, "logps/rejected": -0.26498767733573914, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.9561559557914734, "rewards/margins": -0.0037261247634887695, "rewards/rejected": 0.9598820805549622, "step": 5782 }, { "epoch": 3.12, "learning_rate": 1.2187181284161819e-08, "logits/chosen": -2.097046136856079, "logits/rejected": -2.3241138458251953, "logps/chosen": -0.22340986132621765, "logps/rejected": -0.19844895601272583, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.8871399164199829, "rewards/margins": 0.0393671989440918, "rewards/rejected": 0.8477727174758911, "step": 5783 }, { "epoch": 3.12, "learning_rate": 1.2172896925175451e-08, "logits/chosen": -2.119276285171509, "logits/rejected": -2.1280605792999268, "logps/chosen": -1.1429870128631592, "logps/rejected": -4.3814897537231445, "loss": 0.408, "rewards/accuracies": 1.0, "rewards/chosen": 1.114515781402588, "rewards/margins": 0.6855669021606445, "rewards/rejected": 0.42894887924194336, "step": 5784 }, { "epoch": 3.12, "learning_rate": 1.2158619781959417e-08, "logits/chosen": -2.0270206928253174, "logits/rejected": -2.3132622241973877, "logps/chosen": -0.1714089810848236, "logps/rejected": -0.1929519772529602, "loss": 0.6702, "rewards/accuracies": 1.0, "rewards/chosen": 0.9962412118911743, "rewards/margins": 0.04635554552078247, "rewards/rejected": 0.9498856663703918, "step": 5785 }, { "epoch": 3.12, "learning_rate": 1.2144349857237158e-08, "logits/chosen": -2.1530160903930664, "logits/rejected": -2.1528191566467285, "logps/chosen": -0.6610692143440247, "logps/rejected": -2.4015748500823975, "loss": 0.5579, "rewards/accuracies": 1.0, "rewards/chosen": 1.1047505140304565, "rewards/margins": 0.2915734052658081, "rewards/rejected": 0.8131771087646484, "step": 5786 }, { "epoch": 3.12, "learning_rate": 1.2130087153730756e-08, "logits/chosen": -2.044610023498535, "logits/rejected": -2.3069913387298584, "logps/chosen": -0.23085807263851166, "logps/rejected": -0.21943709254264832, "loss": 0.6682, "rewards/accuracies": 1.0, "rewards/chosen": 1.0044898986816406, "rewards/margins": 0.05055016279220581, "rewards/rejected": 0.9539397358894348, "step": 5787 }, { "epoch": 3.12, "learning_rate": 1.2115831674160914e-08, "logits/chosen": -2.020505428314209, "logits/rejected": -2.020803213119507, "logps/chosen": -2.9472591876983643, "logps/rejected": -6.505153179168701, "loss": 0.2413, "rewards/accuracies": 1.0, "rewards/chosen": 1.5625271797180176, "rewards/margins": 1.2987949848175049, "rewards/rejected": 0.2637321949005127, "step": 5788 }, { "epoch": 3.12, "learning_rate": 1.2101583421246964e-08, "logits/chosen": -2.0838024616241455, "logits/rejected": -2.2756760120391846, "logps/chosen": -1.2713396549224854, "logps/rejected": -1.3721966743469238, "loss": 0.6741, "rewards/accuracies": 1.0, "rewards/chosen": 0.8818243145942688, "rewards/margins": 0.03852808475494385, "rewards/rejected": 0.843296229839325, "step": 5789 }, { "epoch": 3.12, "learning_rate": 1.2087342397706851e-08, "logits/chosen": -2.067180871963501, "logits/rejected": -2.3012218475341797, "logps/chosen": -1.0152329206466675, "logps/rejected": -1.1371996402740479, "loss": 0.7008, "rewards/accuracies": 0.0, "rewards/chosen": 1.0090842247009277, "rewards/margins": -0.01524198055267334, "rewards/rejected": 1.024326205253601, "step": 5790 }, { "epoch": 3.12, "learning_rate": 1.2073108606257148e-08, "logits/chosen": -2.042985200881958, "logits/rejected": -2.262847423553467, "logps/chosen": -0.32322344183921814, "logps/rejected": -0.33951595425605774, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9560750126838684, "rewards/margins": 0.005085110664367676, "rewards/rejected": 0.9509899020195007, "step": 5791 }, { "epoch": 3.12, "learning_rate": 1.205888204961305e-08, "logits/chosen": -2.0480093955993652, "logits/rejected": -2.2902708053588867, "logps/chosen": -0.2980961799621582, "logps/rejected": -0.2875314950942993, "loss": 0.6716, "rewards/accuracies": 1.0, "rewards/chosen": 0.8941823840141296, "rewards/margins": 0.043629348278045654, "rewards/rejected": 0.850553035736084, "step": 5792 }, { "epoch": 3.12, "learning_rate": 1.2044662730488353e-08, "logits/chosen": -2.1203453540802, "logits/rejected": -2.276273488998413, "logps/chosen": -1.201553225517273, "logps/rejected": -1.3516528606414795, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 1.0492841005325317, "rewards/margins": 0.02757120132446289, "rewards/rejected": 1.0217128992080688, "step": 5793 }, { "epoch": 3.13, "learning_rate": 1.2030450651595486e-08, "logits/chosen": -2.110630512237549, "logits/rejected": -2.105689287185669, "logps/chosen": -6.769820690155029, "logps/rejected": -4.699559688568115, "loss": 0.2457, "rewards/accuracies": 1.0, "rewards/chosen": 1.7261120080947876, "rewards/margins": 1.2781213521957397, "rewards/rejected": 0.44799065589904785, "step": 5794 }, { "epoch": 3.13, "learning_rate": 1.2016245815645493e-08, "logits/chosen": -2.1240665912628174, "logits/rejected": -2.106785535812378, "logps/chosen": -13.545235633850098, "logps/rejected": -2.3763678073883057, "loss": 0.3204, "rewards/accuracies": 1.0, "rewards/chosen": 1.4288899898529053, "rewards/margins": 0.9737547636032104, "rewards/rejected": 0.4551352560520172, "step": 5795 }, { "epoch": 3.13, "learning_rate": 1.2002048225348066e-08, "logits/chosen": -2.0825917720794678, "logits/rejected": -2.3593432903289795, "logps/chosen": -1.3283178806304932, "logps/rejected": -14.306747436523438, "loss": 0.4955, "rewards/accuracies": 1.0, "rewards/chosen": 1.023056149482727, "rewards/margins": 0.4443039894104004, "rewards/rejected": 0.5787521600723267, "step": 5796 }, { "epoch": 3.13, "learning_rate": 1.1987857883411472e-08, "logits/chosen": -2.1062557697296143, "logits/rejected": -2.2408719062805176, "logps/chosen": -3.3699936866760254, "logps/rejected": -3.3813695907592773, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.615660548210144, "rewards/margins": 0.04264014959335327, "rewards/rejected": 0.5730203986167908, "step": 5797 }, { "epoch": 3.13, "learning_rate": 1.1973674792542615e-08, "logits/chosen": -2.0403401851654053, "logits/rejected": -2.329423666000366, "logps/chosen": -5.792184352874756, "logps/rejected": -5.002168655395508, "loss": 0.7174, "rewards/accuracies": 0.0, "rewards/chosen": 0.671673059463501, "rewards/margins": -0.04785048961639404, "rewards/rejected": 0.719523549079895, "step": 5798 }, { "epoch": 3.13, "learning_rate": 1.1959498955447012e-08, "logits/chosen": -1.967692494392395, "logits/rejected": -2.2894980907440186, "logps/chosen": -2.992671251296997, "logps/rejected": -3.344959020614624, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.7333032488822937, "rewards/margins": 0.005778372287750244, "rewards/rejected": 0.7275248765945435, "step": 5799 }, { "epoch": 3.13, "learning_rate": 1.1945330374828804e-08, "logits/chosen": -2.1736457347869873, "logits/rejected": -2.1748714447021484, "logps/chosen": -3.5485141277313232, "logps/rejected": -3.7655961513519287, "loss": 0.4739, "rewards/accuracies": 1.0, "rewards/chosen": 1.0807021856307983, "rewards/margins": 0.5004578232765198, "rewards/rejected": 0.5802443623542786, "step": 5800 }, { "epoch": 3.13, "learning_rate": 1.1931169053390732e-08, "logits/chosen": -2.011559247970581, "logits/rejected": -2.264037609100342, "logps/chosen": -0.711851954460144, "logps/rejected": -0.5720542073249817, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 1.0578728914260864, "rewards/margins": 0.008298754692077637, "rewards/rejected": 1.0495741367340088, "step": 5801 }, { "epoch": 3.13, "learning_rate": 1.1917014993834174e-08, "logits/chosen": -2.068246841430664, "logits/rejected": -2.0536625385284424, "logps/chosen": -6.090823173522949, "logps/rejected": -3.18827486038208, "loss": 0.3954, "rewards/accuracies": 1.0, "rewards/chosen": 1.4986733198165894, "rewards/margins": 0.7236958742141724, "rewards/rejected": 0.774977445602417, "step": 5802 }, { "epoch": 3.13, "learning_rate": 1.1902868198859095e-08, "logits/chosen": -2.1146132946014404, "logits/rejected": -2.112581253051758, "logps/chosen": -1.6091340780258179, "logps/rejected": -10.094990730285645, "loss": 0.3418, "rewards/accuracies": 1.0, "rewards/chosen": 0.9617501497268677, "rewards/margins": 0.8978954553604126, "rewards/rejected": 0.06385469436645508, "step": 5803 }, { "epoch": 3.13, "learning_rate": 1.18887286711641e-08, "logits/chosen": -2.026116132736206, "logits/rejected": -2.3024637699127197, "logps/chosen": -0.6031188368797302, "logps/rejected": -0.6604008674621582, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.8591556549072266, "rewards/margins": 0.026125311851501465, "rewards/rejected": 0.8330303430557251, "step": 5804 }, { "epoch": 3.13, "learning_rate": 1.1874596413446397e-08, "logits/chosen": -1.971579670906067, "logits/rejected": -2.23502516746521, "logps/chosen": -0.12615713477134705, "logps/rejected": -0.14524520933628082, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9523888826370239, "rewards/margins": 0.007741391658782959, "rewards/rejected": 0.944647490978241, "step": 5805 }, { "epoch": 3.13, "learning_rate": 1.1860471428401809e-08, "logits/chosen": -2.1841869354248047, "logits/rejected": -2.1478664875030518, "logps/chosen": -23.229873657226562, "logps/rejected": -11.697467803955078, "loss": 0.2947, "rewards/accuracies": 1.0, "rewards/chosen": 2.169973134994507, "rewards/margins": 1.0706546306610107, "rewards/rejected": 1.099318504333496, "step": 5806 }, { "epoch": 3.13, "learning_rate": 1.1846353718724766e-08, "logits/chosen": -2.0746405124664307, "logits/rejected": -2.065058708190918, "logps/chosen": -3.8413405418395996, "logps/rejected": -7.9935622215271, "loss": 0.304, "rewards/accuracies": 1.0, "rewards/chosen": 1.2476091384887695, "rewards/margins": 1.0348554849624634, "rewards/rejected": 0.21275363862514496, "step": 5807 }, { "epoch": 3.13, "learning_rate": 1.1832243287108312e-08, "logits/chosen": -2.021819829940796, "logits/rejected": -2.227250814437866, "logps/chosen": -0.31248265504837036, "logps/rejected": -0.3611290454864502, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 1.0216964483261108, "rewards/margins": 0.007853031158447266, "rewards/rejected": 1.0138434171676636, "step": 5808 }, { "epoch": 3.13, "learning_rate": 1.1818140136244115e-08, "logits/chosen": -2.0650038719177246, "logits/rejected": -2.3225903511047363, "logps/chosen": -0.3068876266479492, "logps/rejected": -0.33592402935028076, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.8920022249221802, "rewards/margins": 0.014583289623260498, "rewards/rejected": 0.8774189352989197, "step": 5809 }, { "epoch": 3.13, "learning_rate": 1.1804044268822433e-08, "logits/chosen": -2.1625332832336426, "logits/rejected": -2.2173733711242676, "logps/chosen": -10.254776954650879, "logps/rejected": -21.003963470458984, "loss": 0.3083, "rewards/accuracies": 1.0, "rewards/chosen": 1.810742974281311, "rewards/margins": 1.0184566974639893, "rewards/rejected": 0.7922863364219666, "step": 5810 }, { "epoch": 3.13, "learning_rate": 1.1789955687532156e-08, "logits/chosen": -1.992973804473877, "logits/rejected": -1.9928858280181885, "logps/chosen": -0.4517468214035034, "logps/rejected": -2.7103195190429688, "loss": 0.5885, "rewards/accuracies": 1.0, "rewards/chosen": 1.0096017122268677, "rewards/margins": 0.22152179479599, "rewards/rejected": 0.7880799174308777, "step": 5811 }, { "epoch": 3.13, "learning_rate": 1.177587439506077e-08, "logits/chosen": -2.1488699913024902, "logits/rejected": -2.1373188495635986, "logps/chosen": -3.9350225925445557, "logps/rejected": -3.027244806289673, "loss": 0.333, "rewards/accuracies": 1.0, "rewards/chosen": 1.701912760734558, "rewards/margins": 0.9284477829933167, "rewards/rejected": 0.7734649777412415, "step": 5812 }, { "epoch": 3.14, "learning_rate": 1.1761800394094373e-08, "logits/chosen": -2.0388996601104736, "logits/rejected": -2.273796319961548, "logps/chosen": -1.2071748971939087, "logps/rejected": -1.2325055599212646, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 0.911965548992157, "rewards/margins": -0.00989830493927002, "rewards/rejected": 0.921863853931427, "step": 5813 }, { "epoch": 3.14, "learning_rate": 1.1747733687317678e-08, "logits/chosen": -2.0374693870544434, "logits/rejected": -2.299753427505493, "logps/chosen": -0.4367872178554535, "logps/rejected": -0.517305314540863, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.7939563393592834, "rewards/margins": 0.04277080297470093, "rewards/rejected": 0.7511855363845825, "step": 5814 }, { "epoch": 3.14, "learning_rate": 1.1733674277413997e-08, "logits/chosen": -2.131030559539795, "logits/rejected": -2.255411386489868, "logps/chosen": -1.323253870010376, "logps/rejected": -1.3424087762832642, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.9356077313423157, "rewards/margins": -0.0038806796073913574, "rewards/rejected": 0.939488410949707, "step": 5815 }, { "epoch": 3.14, "learning_rate": 1.1719622167065263e-08, "logits/chosen": -2.027461051940918, "logits/rejected": -2.2543885707855225, "logps/chosen": -0.28687039017677307, "logps/rejected": -0.2917575240135193, "loss": 0.7014, "rewards/accuracies": 0.0, "rewards/chosen": 0.8795686960220337, "rewards/margins": -0.016517817974090576, "rewards/rejected": 0.8960865139961243, "step": 5816 }, { "epoch": 3.14, "learning_rate": 1.1705577358951984e-08, "logits/chosen": -2.0406911373138428, "logits/rejected": -2.2876124382019043, "logps/chosen": -4.4969892501831055, "logps/rejected": -1.1571112871170044, "loss": 0.7518, "rewards/accuracies": 0.0, "rewards/chosen": 0.8050743937492371, "rewards/margins": -0.11405813694000244, "rewards/rejected": 0.9191325306892395, "step": 5817 }, { "epoch": 3.14, "learning_rate": 1.169153985575334e-08, "logits/chosen": -2.0748343467712402, "logits/rejected": -2.0813512802124023, "logps/chosen": -0.3307907283306122, "logps/rejected": -7.044478893280029, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": 0.9894998669624329, "rewards/margins": 0.7290202379226685, "rewards/rejected": 0.260479599237442, "step": 5818 }, { "epoch": 3.14, "learning_rate": 1.1677509660147056e-08, "logits/chosen": -2.0445454120635986, "logits/rejected": -2.0372393131256104, "logps/chosen": -2.7603390216827393, "logps/rejected": -6.685699939727783, "loss": 0.3318, "rewards/accuracies": 1.0, "rewards/chosen": 1.2302415370941162, "rewards/margins": 0.932693362236023, "rewards/rejected": 0.2975481450557709, "step": 5819 }, { "epoch": 3.14, "learning_rate": 1.166348677480949e-08, "logits/chosen": -2.158132553100586, "logits/rejected": -2.151787042617798, "logps/chosen": -2.3898637294769287, "logps/rejected": -6.265920639038086, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": 1.3723795413970947, "rewards/margins": 1.0829176902770996, "rewards/rejected": 0.2894619107246399, "step": 5820 }, { "epoch": 3.14, "learning_rate": 1.1649471202415594e-08, "logits/chosen": -2.030392646789551, "logits/rejected": -2.307345390319824, "logps/chosen": -0.16199880838394165, "logps/rejected": -0.17797429859638214, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9525747299194336, "rewards/margins": 0.010237693786621094, "rewards/rejected": 0.9423370361328125, "step": 5821 }, { "epoch": 3.14, "learning_rate": 1.1635462945638952e-08, "logits/chosen": -2.0558714866638184, "logits/rejected": -2.2999274730682373, "logps/chosen": -3.1274218559265137, "logps/rejected": -2.8806440830230713, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8388633728027344, "rewards/margins": 0.01587921380996704, "rewards/rejected": 0.8229841589927673, "step": 5822 }, { "epoch": 3.14, "learning_rate": 1.1621462007151695e-08, "logits/chosen": -2.091824769973755, "logits/rejected": -2.2877092361450195, "logps/chosen": -0.10247586667537689, "logps/rejected": -0.09785138815641403, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.8782498240470886, "rewards/margins": 0.03053337335586548, "rewards/rejected": 0.8477164506912231, "step": 5823 }, { "epoch": 3.14, "learning_rate": 1.1607468389624619e-08, "logits/chosen": -2.04178524017334, "logits/rejected": -2.2977051734924316, "logps/chosen": -0.4252271056175232, "logps/rejected": -0.47790995240211487, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.9713137745857239, "rewards/margins": 0.02150803804397583, "rewards/rejected": 0.949805736541748, "step": 5824 }, { "epoch": 3.14, "learning_rate": 1.1593482095727092e-08, "logits/chosen": -2.0841190814971924, "logits/rejected": -2.3169543743133545, "logps/chosen": -5.760383605957031, "logps/rejected": -6.1198530197143555, "loss": 0.6563, "rewards/accuracies": 1.0, "rewards/chosen": 0.8879653811454773, "rewards/margins": 0.0751219391822815, "rewards/rejected": 0.8128434419631958, "step": 5825 }, { "epoch": 3.14, "learning_rate": 1.1579503128127094e-08, "logits/chosen": -2.1923141479492188, "logits/rejected": -2.344559907913208, "logps/chosen": -0.22066611051559448, "logps/rejected": -0.1921093612909317, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.914903461933136, "rewards/margins": 0.008297443389892578, "rewards/rejected": 0.9066060185432434, "step": 5826 }, { "epoch": 3.14, "learning_rate": 1.1565531489491203e-08, "logits/chosen": -2.0028834342956543, "logits/rejected": -2.305591344833374, "logps/chosen": -0.38869521021842957, "logps/rejected": -0.4002420902252197, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 1.055647611618042, "rewards/margins": 0.034774065017700195, "rewards/rejected": 1.0208735466003418, "step": 5827 }, { "epoch": 3.14, "learning_rate": 1.1551567182484607e-08, "logits/chosen": -2.002636671066284, "logits/rejected": -2.2476987838745117, "logps/chosen": -2.0536346435546875, "logps/rejected": -1.102752685546875, "loss": 0.6633, "rewards/accuracies": 1.0, "rewards/chosen": 0.8083108067512512, "rewards/margins": 0.06061506271362305, "rewards/rejected": 0.7476957440376282, "step": 5828 }, { "epoch": 3.14, "learning_rate": 1.1537610209771076e-08, "logits/chosen": -1.9646754264831543, "logits/rejected": -2.3736250400543213, "logps/chosen": -5.726910591125488, "logps/rejected": -6.333251953125, "loss": 0.642, "rewards/accuracies": 1.0, "rewards/chosen": 1.25468111038208, "rewards/margins": 0.1049656867980957, "rewards/rejected": 1.1497154235839844, "step": 5829 }, { "epoch": 3.14, "learning_rate": 1.1523660574013016e-08, "logits/chosen": -2.0446903705596924, "logits/rejected": -2.041470766067505, "logps/chosen": -5.157448768615723, "logps/rejected": -4.256810188293457, "loss": 0.2929, "rewards/accuracies": 1.0, "rewards/chosen": 1.5465697050094604, "rewards/margins": 1.0778412818908691, "rewards/rejected": 0.46872836351394653, "step": 5830 }, { "epoch": 3.15, "learning_rate": 1.1509718277871405e-08, "logits/chosen": -2.025935173034668, "logits/rejected": -2.0260045528411865, "logps/chosen": -1.00868821144104, "logps/rejected": -2.66253399848938, "loss": 0.4979, "rewards/accuracies": 1.0, "rewards/chosen": 1.3280670642852783, "rewards/margins": 0.4381822943687439, "rewards/rejected": 0.8898847699165344, "step": 5831 }, { "epoch": 3.15, "learning_rate": 1.1495783324005825e-08, "logits/chosen": -1.9893498420715332, "logits/rejected": -2.299536943435669, "logps/chosen": -2.159437417984009, "logps/rejected": -11.321647644042969, "loss": 0.5887, "rewards/accuracies": 1.0, "rewards/chosen": 0.954003632068634, "rewards/margins": 0.2210419774055481, "rewards/rejected": 0.7329616546630859, "step": 5832 }, { "epoch": 3.15, "learning_rate": 1.1481855715074462e-08, "logits/chosen": -2.132519483566284, "logits/rejected": -2.1066529750823975, "logps/chosen": -15.848737716674805, "logps/rejected": -9.518364906311035, "loss": 0.1668, "rewards/accuracies": 1.0, "rewards/chosen": 1.8170636892318726, "rewards/margins": 1.7062124013900757, "rewards/rejected": 0.11085128784179688, "step": 5833 }, { "epoch": 3.15, "learning_rate": 1.14679354537341e-08, "logits/chosen": -2.078524351119995, "logits/rejected": -2.0744850635528564, "logps/chosen": -2.43886137008667, "logps/rejected": -6.170592784881592, "loss": 0.4338, "rewards/accuracies": 1.0, "rewards/chosen": 1.3028711080551147, "rewards/margins": 0.6103267073631287, "rewards/rejected": 0.6925444006919861, "step": 5834 }, { "epoch": 3.15, "learning_rate": 1.1454022542640123e-08, "logits/chosen": -2.1092891693115234, "logits/rejected": -2.1084821224212646, "logps/chosen": -0.5430690050125122, "logps/rejected": -10.68705940246582, "loss": 0.4879, "rewards/accuracies": 1.0, "rewards/chosen": 1.1123895645141602, "rewards/margins": 0.4638112783432007, "rewards/rejected": 0.6485782861709595, "step": 5835 }, { "epoch": 3.15, "learning_rate": 1.1440116984446503e-08, "logits/chosen": -2.140252113342285, "logits/rejected": -2.1067395210266113, "logps/chosen": -22.508377075195312, "logps/rejected": -15.265113830566406, "loss": 0.262, "rewards/accuracies": 1.0, "rewards/chosen": 2.247901201248169, "rewards/margins": 1.20558762550354, "rewards/rejected": 1.042313575744629, "step": 5836 }, { "epoch": 3.15, "learning_rate": 1.1426218781805824e-08, "logits/chosen": -2.1643571853637695, "logits/rejected": -2.055518865585327, "logps/chosen": -20.20201301574707, "logps/rejected": -1.0824098587036133, "loss": 0.2331, "rewards/accuracies": 1.0, "rewards/chosen": 2.2152342796325684, "rewards/margins": 1.3374266624450684, "rewards/rejected": 0.8778075575828552, "step": 5837 }, { "epoch": 3.15, "learning_rate": 1.1412327937369259e-08, "logits/chosen": -2.050745725631714, "logits/rejected": -2.0498721599578857, "logps/chosen": -6.046751976013184, "logps/rejected": -6.657005310058594, "loss": 0.4061, "rewards/accuracies": 1.0, "rewards/chosen": 1.1156480312347412, "rewards/margins": 0.6912270784378052, "rewards/rejected": 0.42442092299461365, "step": 5838 }, { "epoch": 3.15, "learning_rate": 1.1398444453786594e-08, "logits/chosen": -2.0606305599212646, "logits/rejected": -2.0705292224884033, "logps/chosen": -2.9321231842041016, "logps/rejected": -3.8600642681121826, "loss": 0.3834, "rewards/accuracies": 1.0, "rewards/chosen": 1.2939904928207397, "rewards/margins": 0.761013925075531, "rewards/rejected": 0.5329765677452087, "step": 5839 }, { "epoch": 3.15, "learning_rate": 1.1384568333706151e-08, "logits/chosen": -2.126743793487549, "logits/rejected": -2.130474328994751, "logps/chosen": -1.9837427139282227, "logps/rejected": -7.138232231140137, "loss": 0.3839, "rewards/accuracies": 1.0, "rewards/chosen": 1.4238845109939575, "rewards/margins": 0.7593525052070618, "rewards/rejected": 0.6645320057868958, "step": 5840 }, { "epoch": 3.15, "learning_rate": 1.1370699579774934e-08, "logits/chosen": -2.0513415336608887, "logits/rejected": -2.0484488010406494, "logps/chosen": -1.2694674730300903, "logps/rejected": -5.888720989227295, "loss": 0.4157, "rewards/accuracies": 1.0, "rewards/chosen": 0.9121382832527161, "rewards/margins": 0.6628614664077759, "rewards/rejected": 0.2492767870426178, "step": 5841 }, { "epoch": 3.15, "learning_rate": 1.1356838194638485e-08, "logits/chosen": -2.0954818725585938, "logits/rejected": -2.095306634902954, "logps/chosen": -2.6675963401794434, "logps/rejected": -12.6126127243042, "loss": 0.3305, "rewards/accuracies": 1.0, "rewards/chosen": 1.1342943906784058, "rewards/margins": 0.9372167587280273, "rewards/rejected": 0.1970776617527008, "step": 5842 }, { "epoch": 3.15, "learning_rate": 1.134298418094095e-08, "logits/chosen": -2.1308393478393555, "logits/rejected": -2.261035919189453, "logps/chosen": -0.10325711965560913, "logps/rejected": -0.12288473546504974, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.9532351493835449, "rewards/margins": 0.021421611309051514, "rewards/rejected": 0.9318135380744934, "step": 5843 }, { "epoch": 3.15, "learning_rate": 1.132913754132509e-08, "logits/chosen": -2.162209987640381, "logits/rejected": -2.1694071292877197, "logps/chosen": -1.3989191055297852, "logps/rejected": -3.792407274246216, "loss": 0.4858, "rewards/accuracies": 1.0, "rewards/chosen": 1.0459256172180176, "rewards/margins": 0.4692743420600891, "rewards/rejected": 0.5766512751579285, "step": 5844 }, { "epoch": 3.15, "learning_rate": 1.1315298278432228e-08, "logits/chosen": -2.194352626800537, "logits/rejected": -2.3777315616607666, "logps/chosen": -0.4889959394931793, "logps/rejected": -0.47781720757484436, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 1.0828384160995483, "rewards/margins": -0.001865386962890625, "rewards/rejected": 1.084703803062439, "step": 5845 }, { "epoch": 3.15, "learning_rate": 1.1301466394902299e-08, "logits/chosen": -2.1228911876678467, "logits/rejected": -2.351942300796509, "logps/chosen": -0.18228518962860107, "logps/rejected": -0.21724796295166016, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.8616743087768555, "rewards/margins": 0.018205642700195312, "rewards/rejected": 0.8434686660766602, "step": 5846 }, { "epoch": 3.15, "learning_rate": 1.1287641893373828e-08, "logits/chosen": -2.145928144454956, "logits/rejected": -2.3692688941955566, "logps/chosen": -0.04357811063528061, "logps/rejected": -0.0450587272644043, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.768572986125946, "rewards/margins": 0.032310664653778076, "rewards/rejected": 0.736262321472168, "step": 5847 }, { "epoch": 3.15, "learning_rate": 1.1273824776483931e-08, "logits/chosen": -2.152203321456909, "logits/rejected": -2.301867723464966, "logps/chosen": -1.335790991783142, "logps/rejected": -1.5348827838897705, "loss": 0.6743, "rewards/accuracies": 1.0, "rewards/chosen": 0.6775926351547241, "rewards/margins": 0.03803497552871704, "rewards/rejected": 0.6395576596260071, "step": 5848 }, { "epoch": 3.15, "learning_rate": 1.126001504686831e-08, "logits/chosen": -2.08201003074646, "logits/rejected": -2.3020665645599365, "logps/chosen": -3.7248940467834473, "logps/rejected": -2.8518667221069336, "loss": 0.721, "rewards/accuracies": 0.0, "rewards/chosen": 0.6055026650428772, "rewards/margins": -0.05492454767227173, "rewards/rejected": 0.6604272127151489, "step": 5849 }, { "epoch": 3.16, "learning_rate": 1.1246212707161268e-08, "logits/chosen": -2.229382038116455, "logits/rejected": -2.2843706607818604, "logps/chosen": -5.01538610458374, "logps/rejected": -17.203458786010742, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": 1.4695833921432495, "rewards/margins": 0.972699761390686, "rewards/rejected": 0.4968836009502411, "step": 5850 }, { "epoch": 3.16, "learning_rate": 1.1232417759995678e-08, "logits/chosen": -1.9621022939682007, "logits/rejected": -2.22860050201416, "logps/chosen": -0.9011348485946655, "logps/rejected": -0.7196077108383179, "loss": 0.6681, "rewards/accuracies": 1.0, "rewards/chosen": 1.0085828304290771, "rewards/margins": 0.05080133676528931, "rewards/rejected": 0.9577814936637878, "step": 5851 }, { "epoch": 3.16, "learning_rate": 1.1218630208003049e-08, "logits/chosen": -2.118537187576294, "logits/rejected": -2.1368842124938965, "logps/chosen": -3.870051383972168, "logps/rejected": -3.9609367847442627, "loss": 0.4338, "rewards/accuracies": 1.0, "rewards/chosen": 1.3193897008895874, "rewards/margins": 0.6105394959449768, "rewards/rejected": 0.7088502049446106, "step": 5852 }, { "epoch": 3.16, "learning_rate": 1.120485005381342e-08, "logits/chosen": -1.9733473062515259, "logits/rejected": -2.326516628265381, "logps/chosen": -2.237431287765503, "logps/rejected": -2.289252281188965, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.8877385258674622, "rewards/margins": -0.009401500225067139, "rewards/rejected": 0.8971400260925293, "step": 5853 }, { "epoch": 3.16, "learning_rate": 1.1191077300055456e-08, "logits/chosen": -2.1220335960388184, "logits/rejected": -2.2395248413085938, "logps/chosen": -0.1780235916376114, "logps/rejected": -0.213703915476799, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 1.007564663887024, "rewards/margins": 0.02002429962158203, "rewards/rejected": 0.9875403642654419, "step": 5854 }, { "epoch": 3.16, "learning_rate": 1.11773119493564e-08, "logits/chosen": -2.1243603229522705, "logits/rejected": -2.401280641555786, "logps/chosen": -10.99085521697998, "logps/rejected": -7.195013999938965, "loss": 0.7799, "rewards/accuracies": 0.0, "rewards/chosen": 1.0846600532531738, "rewards/margins": -0.16664397716522217, "rewards/rejected": 1.251304030418396, "step": 5855 }, { "epoch": 3.16, "learning_rate": 1.116355400434208e-08, "logits/chosen": -2.069692850112915, "logits/rejected": -2.2786920070648193, "logps/chosen": -4.394425392150879, "logps/rejected": -0.8729344010353088, "loss": 0.8518, "rewards/accuracies": 0.0, "rewards/chosen": 0.8076937794685364, "rewards/margins": -0.2955586314201355, "rewards/rejected": 1.1032524108886719, "step": 5856 }, { "epoch": 3.16, "learning_rate": 1.114980346763692e-08, "logits/chosen": -2.02644419670105, "logits/rejected": -2.2987568378448486, "logps/chosen": -0.21808011829853058, "logps/rejected": -0.22672487795352936, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.996200680732727, "rewards/margins": 0.0005577206611633301, "rewards/rejected": 0.9956429600715637, "step": 5857 }, { "epoch": 3.16, "learning_rate": 1.1136060341863922e-08, "logits/chosen": -1.961355447769165, "logits/rejected": -2.3100686073303223, "logps/chosen": -0.8413528800010681, "logps/rejected": -0.9854430556297302, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.8858651518821716, "rewards/margins": 0.018799304962158203, "rewards/rejected": 0.8670658469200134, "step": 5858 }, { "epoch": 3.16, "learning_rate": 1.1122324629644685e-08, "logits/chosen": -2.0524420738220215, "logits/rejected": -2.321629285812378, "logps/chosen": -1.8086016178131104, "logps/rejected": -2.0036261081695557, "loss": 0.6612, "rewards/accuracies": 1.0, "rewards/chosen": 0.921395480632782, "rewards/margins": 0.06495243310928345, "rewards/rejected": 0.8564430475234985, "step": 5859 }, { "epoch": 3.16, "learning_rate": 1.1108596333599384e-08, "logits/chosen": -2.025841474533081, "logits/rejected": -2.1859374046325684, "logps/chosen": -0.598721981048584, "logps/rejected": -0.5598853230476379, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.9599895477294922, "rewards/margins": 0.0014093518257141113, "rewards/rejected": 0.9585801959037781, "step": 5860 }, { "epoch": 3.16, "learning_rate": 1.109487545634678e-08, "logits/chosen": -2.150193929672241, "logits/rejected": -2.15954327583313, "logps/chosen": -3.9771106243133545, "logps/rejected": -5.001987934112549, "loss": 0.3786, "rewards/accuracies": 1.0, "rewards/chosen": 1.2716155052185059, "rewards/margins": 0.7759706974029541, "rewards/rejected": 0.49564480781555176, "step": 5861 }, { "epoch": 3.16, "learning_rate": 1.1081162000504207e-08, "logits/chosen": -2.1429443359375, "logits/rejected": -2.1394543647766113, "logps/chosen": -5.510794639587402, "logps/rejected": -2.7243316173553467, "loss": 0.352, "rewards/accuracies": 1.0, "rewards/chosen": 1.5401090383529663, "rewards/margins": 0.8630425930023193, "rewards/rejected": 0.677066445350647, "step": 5862 }, { "epoch": 3.16, "learning_rate": 1.1067455968687623e-08, "logits/chosen": -2.022094964981079, "logits/rejected": -2.241351842880249, "logps/chosen": -0.31253308057785034, "logps/rejected": -0.3377961218357086, "loss": 0.6719, "rewards/accuracies": 1.0, "rewards/chosen": 1.0739518404006958, "rewards/margins": 0.04297482967376709, "rewards/rejected": 1.0309770107269287, "step": 5863 }, { "epoch": 3.16, "learning_rate": 1.105375736351154e-08, "logits/chosen": -2.195866823196411, "logits/rejected": -2.258383274078369, "logps/chosen": -3.5475847721099854, "logps/rejected": -5.123040199279785, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 1.039942741394043, "rewards/margins": 0.02174246311187744, "rewards/rejected": 1.0182002782821655, "step": 5864 }, { "epoch": 3.16, "learning_rate": 1.1040066187589048e-08, "logits/chosen": -2.1922643184661865, "logits/rejected": -2.184217929840088, "logps/chosen": -6.242149353027344, "logps/rejected": -5.37895393371582, "loss": 0.3583, "rewards/accuracies": 1.0, "rewards/chosen": 1.297784447669983, "rewards/margins": 0.8419840931892395, "rewards/rejected": 0.4558003544807434, "step": 5865 }, { "epoch": 3.16, "learning_rate": 1.1026382443531834e-08, "logits/chosen": -2.1302270889282227, "logits/rejected": -2.132779359817505, "logps/chosen": -0.6856104731559753, "logps/rejected": -4.280158042907715, "loss": 0.4516, "rewards/accuracies": 1.0, "rewards/chosen": 1.0318104028701782, "rewards/margins": 0.5606594085693359, "rewards/rejected": 0.4711509644985199, "step": 5866 }, { "epoch": 3.16, "learning_rate": 1.1012706133950162e-08, "logits/chosen": -2.051835060119629, "logits/rejected": -2.2647507190704346, "logps/chosen": -0.43706145882606506, "logps/rejected": -0.42958471179008484, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 1.031093716621399, "rewards/margins": 0.028705358505249023, "rewards/rejected": 1.00238835811615, "step": 5867 }, { "epoch": 3.17, "learning_rate": 1.0999037261452882e-08, "logits/chosen": -2.212829113006592, "logits/rejected": -2.3609907627105713, "logps/chosen": -14.172637939453125, "logps/rejected": -16.050161361694336, "loss": 0.5411, "rewards/accuracies": 1.0, "rewards/chosen": 1.362287163734436, "rewards/margins": 0.33142948150634766, "rewards/rejected": 1.0308576822280884, "step": 5868 }, { "epoch": 3.17, "learning_rate": 1.0985375828647431e-08, "logits/chosen": -2.2279651165008545, "logits/rejected": -2.2209486961364746, "logps/chosen": -1.440797209739685, "logps/rejected": -10.667476654052734, "loss": 0.3469, "rewards/accuracies": 1.0, "rewards/chosen": 1.3872474431991577, "rewards/margins": 0.8804086446762085, "rewards/rejected": 0.5068387985229492, "step": 5869 }, { "epoch": 3.17, "learning_rate": 1.0971721838139786e-08, "logits/chosen": -2.027834177017212, "logits/rejected": -2.2353804111480713, "logps/chosen": -0.4951547384262085, "logps/rejected": -0.4631499648094177, "loss": 0.6743, "rewards/accuracies": 1.0, "rewards/chosen": 1.0023313760757446, "rewards/margins": 0.03815460205078125, "rewards/rejected": 0.9641767740249634, "step": 5870 }, { "epoch": 3.17, "learning_rate": 1.0958075292534558e-08, "logits/chosen": -2.0776493549346924, "logits/rejected": -2.393573760986328, "logps/chosen": -0.06430898606777191, "logps/rejected": -0.06720609962940216, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 0.9388934969902039, "rewards/margins": -0.010283231735229492, "rewards/rejected": 0.9491767287254333, "step": 5871 }, { "epoch": 3.17, "learning_rate": 1.0944436194434914e-08, "logits/chosen": -2.0637195110321045, "logits/rejected": -2.254021167755127, "logps/chosen": -0.25493213534355164, "logps/rejected": -0.23963993787765503, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.7182108759880066, "rewards/margins": 0.039073169231414795, "rewards/rejected": 0.6791377067565918, "step": 5872 }, { "epoch": 3.17, "learning_rate": 1.09308045464426e-08, "logits/chosen": -2.075315475463867, "logits/rejected": -2.249920606613159, "logps/chosen": -0.4359569251537323, "logps/rejected": -0.44190341234207153, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.8859163522720337, "rewards/margins": 0.009199142456054688, "rewards/rejected": 0.876717209815979, "step": 5873 }, { "epoch": 3.17, "learning_rate": 1.0917180351157934e-08, "logits/chosen": -2.1344597339630127, "logits/rejected": -2.347139596939087, "logps/chosen": -1.436582088470459, "logps/rejected": -1.4761438369750977, "loss": 0.6714, "rewards/accuracies": 1.0, "rewards/chosen": 0.9979705810546875, "rewards/margins": 0.04406183958053589, "rewards/rejected": 0.9539087414741516, "step": 5874 }, { "epoch": 3.17, "learning_rate": 1.0903563611179845e-08, "logits/chosen": -2.251725435256958, "logits/rejected": -2.1986610889434814, "logps/chosen": -20.493871688842773, "logps/rejected": -3.683448076248169, "loss": 0.1825, "rewards/accuracies": 1.0, "rewards/chosen": 2.439054250717163, "rewards/margins": 1.6083085536956787, "rewards/rejected": 0.8307456970214844, "step": 5875 }, { "epoch": 3.17, "learning_rate": 1.08899543291058e-08, "logits/chosen": -2.252232551574707, "logits/rejected": -2.166679620742798, "logps/chosen": -19.417545318603516, "logps/rejected": -6.322509288787842, "loss": 0.3405, "rewards/accuracies": 1.0, "rewards/chosen": 1.690092921257019, "rewards/margins": 0.9021140933036804, "rewards/rejected": 0.7879788279533386, "step": 5876 }, { "epoch": 3.17, "learning_rate": 1.0876352507531866e-08, "logits/chosen": -2.0700807571411133, "logits/rejected": -2.2624878883361816, "logps/chosen": -0.3467206656932831, "logps/rejected": -0.31723645329475403, "loss": 0.6977, "rewards/accuracies": 0.0, "rewards/chosen": 0.7997633814811707, "rewards/margins": -0.009056389331817627, "rewards/rejected": 0.8088197708129883, "step": 5877 }, { "epoch": 3.17, "learning_rate": 1.0862758149052676e-08, "logits/chosen": -2.083846092224121, "logits/rejected": -2.2956347465515137, "logps/chosen": -0.27193090319633484, "logps/rejected": -0.2389393448829651, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.9071308970451355, "rewards/margins": 0.03328949213027954, "rewards/rejected": 0.873841404914856, "step": 5878 }, { "epoch": 3.17, "learning_rate": 1.0849171256261447e-08, "logits/chosen": -2.16131854057312, "logits/rejected": -2.345167398452759, "logps/chosen": -0.7411091923713684, "logps/rejected": -5.437022686004639, "loss": 0.627, "rewards/accuracies": 1.0, "rewards/chosen": 0.9391180276870728, "rewards/margins": 0.13687878847122192, "rewards/rejected": 0.8022392392158508, "step": 5879 }, { "epoch": 3.17, "learning_rate": 1.0835591831749962e-08, "logits/chosen": -2.1504404544830322, "logits/rejected": -2.3672661781311035, "logps/chosen": -0.4706314504146576, "logps/rejected": -0.4725411832332611, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.86725914478302, "rewards/margins": 0.02394479513168335, "rewards/rejected": 0.8433143496513367, "step": 5880 }, { "epoch": 3.17, "learning_rate": 1.0822019878108596e-08, "logits/chosen": -1.9993364810943604, "logits/rejected": -2.2633841037750244, "logps/chosen": -0.669164776802063, "logps/rejected": -0.7652424573898315, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.7824293971061707, "rewards/margins": -0.010596036911010742, "rewards/rejected": 0.7930254340171814, "step": 5881 }, { "epoch": 3.17, "learning_rate": 1.0808455397926281e-08, "logits/chosen": -2.0950236320495605, "logits/rejected": -2.2438807487487793, "logps/chosen": -6.378908157348633, "logps/rejected": -10.576336860656738, "loss": 0.5971, "rewards/accuracies": 1.0, "rewards/chosen": 0.8386171460151672, "rewards/margins": 0.20231389999389648, "rewards/rejected": 0.6363032460212708, "step": 5882 }, { "epoch": 3.17, "learning_rate": 1.0794898393790536e-08, "logits/chosen": -2.116826295852661, "logits/rejected": -2.2896344661712646, "logps/chosen": -1.3476498126983643, "logps/rejected": -1.6425689458847046, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.882093608379364, "rewards/margins": 0.008935928344726562, "rewards/rejected": 0.8731576800346375, "step": 5883 }, { "epoch": 3.17, "learning_rate": 1.0781348868287448e-08, "logits/chosen": -2.0962493419647217, "logits/rejected": -2.1061623096466064, "logps/chosen": -3.7647714614868164, "logps/rejected": -11.309331893920898, "loss": 0.6689, "rewards/accuracies": 1.0, "rewards/chosen": 0.8411822319030762, "rewards/margins": 0.049111247062683105, "rewards/rejected": 0.7920709848403931, "step": 5884 }, { "epoch": 3.17, "learning_rate": 1.076780682400168e-08, "logits/chosen": -2.0498857498168945, "logits/rejected": -2.3108460903167725, "logps/chosen": -0.3802779018878937, "logps/rejected": -0.45792439579963684, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 1.0669292211532593, "rewards/margins": 0.0024805068969726562, "rewards/rejected": 1.0644487142562866, "step": 5885 }, { "epoch": 3.17, "learning_rate": 1.0754272263516462e-08, "logits/chosen": -2.1083126068115234, "logits/rejected": -2.1095306873321533, "logps/chosen": -1.0234869718551636, "logps/rejected": -2.139220714569092, "loss": 0.5815, "rewards/accuracies": 1.0, "rewards/chosen": 1.176753044128418, "rewards/margins": 0.23734474182128906, "rewards/rejected": 0.9394083023071289, "step": 5886 }, { "epoch": 3.18, "learning_rate": 1.0740745189413608e-08, "logits/chosen": -2.1235110759735107, "logits/rejected": -2.1109604835510254, "logps/chosen": -2.777116060256958, "logps/rejected": -5.91124963760376, "loss": 0.3951, "rewards/accuracies": 1.0, "rewards/chosen": 1.0741370916366577, "rewards/margins": 0.7245773077011108, "rewards/rejected": 0.3495597541332245, "step": 5887 }, { "epoch": 3.18, "learning_rate": 1.0727225604273488e-08, "logits/chosen": -2.091641426086426, "logits/rejected": -2.316188097000122, "logps/chosen": -11.689264297485352, "logps/rejected": -11.106022834777832, "loss": 0.5578, "rewards/accuracies": 1.0, "rewards/chosen": 1.3371893167495728, "rewards/margins": 0.29180824756622314, "rewards/rejected": 1.0453810691833496, "step": 5888 }, { "epoch": 3.18, "learning_rate": 1.0713713510675059e-08, "logits/chosen": -2.02766752243042, "logits/rejected": -2.037702798843384, "logps/chosen": -7.6589250564575195, "logps/rejected": -1.5116748809814453, "loss": 0.6019, "rewards/accuracies": 1.0, "rewards/chosen": 1.2773741483688354, "rewards/margins": 0.1917133331298828, "rewards/rejected": 1.0856608152389526, "step": 5889 }, { "epoch": 3.18, "learning_rate": 1.070020891119584e-08, "logits/chosen": -2.073195219039917, "logits/rejected": -2.06976580619812, "logps/chosen": -1.937161922454834, "logps/rejected": -4.123862266540527, "loss": 0.5095, "rewards/accuracies": 1.0, "rewards/chosen": 0.9756439328193665, "rewards/margins": 0.408669114112854, "rewards/rejected": 0.5669748187065125, "step": 5890 }, { "epoch": 3.18, "learning_rate": 1.0686711808411925e-08, "logits/chosen": -2.092684507369995, "logits/rejected": -2.098726511001587, "logps/chosen": -1.4191359281539917, "logps/rejected": -2.4837429523468018, "loss": 0.4663, "rewards/accuracies": 1.0, "rewards/chosen": 1.1886183023452759, "rewards/margins": 0.5208603739738464, "rewards/rejected": 0.6677579283714294, "step": 5891 }, { "epoch": 3.18, "learning_rate": 1.0673222204897975e-08, "logits/chosen": -2.130276679992676, "logits/rejected": -2.3215315341949463, "logps/chosen": -0.4269047975540161, "logps/rejected": -0.4574316740036011, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 1.0536768436431885, "rewards/margins": 0.015107512474060059, "rewards/rejected": 1.0385693311691284, "step": 5892 }, { "epoch": 3.18, "learning_rate": 1.0659740103227216e-08, "logits/chosen": -1.9848047494888306, "logits/rejected": -1.9980219602584839, "logps/chosen": -1.1599066257476807, "logps/rejected": -7.468216896057129, "loss": 0.372, "rewards/accuracies": 1.0, "rewards/chosen": 1.3458073139190674, "rewards/margins": 0.7971479892730713, "rewards/rejected": 0.5486593246459961, "step": 5893 }, { "epoch": 3.18, "learning_rate": 1.0646265505971458e-08, "logits/chosen": -2.076263904571533, "logits/rejected": -2.0768024921417236, "logps/chosen": -0.6146270036697388, "logps/rejected": -4.881604194641113, "loss": 0.411, "rewards/accuracies": 1.0, "rewards/chosen": 1.0728996992111206, "rewards/margins": 0.6767170429229736, "rewards/rejected": 0.3961826264858246, "step": 5894 }, { "epoch": 3.18, "learning_rate": 1.0632798415701061e-08, "logits/chosen": -2.109421730041504, "logits/rejected": -2.116651773452759, "logps/chosen": -0.3737088739871979, "logps/rejected": -16.774795532226562, "loss": 0.3361, "rewards/accuracies": 1.0, "rewards/chosen": 0.963265061378479, "rewards/margins": 0.9177137613296509, "rewards/rejected": 0.045551300048828125, "step": 5895 }, { "epoch": 3.18, "learning_rate": 1.0619338834984948e-08, "logits/chosen": -2.0276107788085938, "logits/rejected": -2.022488594055176, "logps/chosen": -5.7899065017700195, "logps/rejected": -4.0710930824279785, "loss": 0.2869, "rewards/accuracies": 1.0, "rewards/chosen": 1.6741119623184204, "rewards/margins": 1.1019365787506104, "rewards/rejected": 0.5721753835678101, "step": 5896 }, { "epoch": 3.18, "learning_rate": 1.0605886766390659e-08, "logits/chosen": -2.1238269805908203, "logits/rejected": -2.061162233352661, "logps/chosen": -11.936901092529297, "logps/rejected": -22.489564895629883, "loss": 0.1889, "rewards/accuracies": 1.0, "rewards/chosen": 1.7738968133926392, "rewards/margins": 1.570381760597229, "rewards/rejected": 0.20351505279541016, "step": 5897 }, { "epoch": 3.18, "learning_rate": 1.0592442212484253e-08, "logits/chosen": -2.0904364585876465, "logits/rejected": -2.287062883377075, "logps/chosen": -0.1189185380935669, "logps/rejected": -0.1063171848654747, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.6759101748466492, "rewards/margins": 0.013872861862182617, "rewards/rejected": 0.6620373129844666, "step": 5898 }, { "epoch": 3.18, "learning_rate": 1.0579005175830352e-08, "logits/chosen": -2.1209468841552734, "logits/rejected": -2.104473352432251, "logps/chosen": -3.717761993408203, "logps/rejected": -5.097622871398926, "loss": 0.4431, "rewards/accuracies": 1.0, "rewards/chosen": 1.2014497518539429, "rewards/margins": 0.584288477897644, "rewards/rejected": 0.6171612739562988, "step": 5899 }, { "epoch": 3.18, "learning_rate": 1.0565575658992171e-08, "logits/chosen": -2.0315921306610107, "logits/rejected": -2.0320193767547607, "logps/chosen": -0.8261847496032715, "logps/rejected": -3.6642794609069824, "loss": 0.5207, "rewards/accuracies": 1.0, "rewards/chosen": 1.1409438848495483, "rewards/margins": 0.3810684084892273, "rewards/rejected": 0.759875476360321, "step": 5900 }, { "epoch": 3.18, "learning_rate": 1.0552153664531472e-08, "logits/chosen": -2.09857177734375, "logits/rejected": -2.1030633449554443, "logps/chosen": -1.9802558422088623, "logps/rejected": -6.033553600311279, "loss": 0.3634, "rewards/accuracies": 1.0, "rewards/chosen": 1.1416515111923218, "rewards/margins": 0.8251023292541504, "rewards/rejected": 0.316549152135849, "step": 5901 }, { "epoch": 3.18, "learning_rate": 1.0538739195008594e-08, "logits/chosen": -2.1214747428894043, "logits/rejected": -2.2909908294677734, "logps/chosen": -1.062427282333374, "logps/rejected": -0.9926292896270752, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.6206154227256775, "rewards/margins": 0.0020664334297180176, "rewards/rejected": 0.6185489892959595, "step": 5902 }, { "epoch": 3.18, "learning_rate": 1.0525332252982434e-08, "logits/chosen": -2.0445125102996826, "logits/rejected": -2.036367654800415, "logps/chosen": -8.219093322753906, "logps/rejected": -0.7458199858665466, "loss": 0.5154, "rewards/accuracies": 1.0, "rewards/chosen": 1.306650161743164, "rewards/margins": 0.3941687345504761, "rewards/rejected": 0.912481427192688, "step": 5903 }, { "epoch": 3.18, "learning_rate": 1.0511932841010457e-08, "logits/chosen": -2.18957781791687, "logits/rejected": -2.186703681945801, "logps/chosen": -0.7892951965332031, "logps/rejected": -6.301600456237793, "loss": 0.4342, "rewards/accuracies": 1.0, "rewards/chosen": 1.053178310394287, "rewards/margins": 0.6093894243240356, "rewards/rejected": 0.44378891587257385, "step": 5904 }, { "epoch": 3.19, "learning_rate": 1.0498540961648694e-08, "logits/chosen": -2.1554017066955566, "logits/rejected": -2.350951671600342, "logps/chosen": -0.264741450548172, "logps/rejected": -0.2459443360567093, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.874951183795929, "rewards/margins": 0.029060065746307373, "rewards/rejected": 0.8458911180496216, "step": 5905 }, { "epoch": 3.19, "learning_rate": 1.0485156617451724e-08, "logits/chosen": -2.198896646499634, "logits/rejected": -2.312070846557617, "logps/chosen": -4.882683277130127, "logps/rejected": -2.6050686836242676, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.728329598903656, "rewards/margins": 0.02843165397644043, "rewards/rejected": 0.6998979449272156, "step": 5906 }, { "epoch": 3.19, "learning_rate": 1.0471779810972692e-08, "logits/chosen": -2.005794048309326, "logits/rejected": -2.0254833698272705, "logps/chosen": -1.94967520236969, "logps/rejected": -5.005408763885498, "loss": 0.4258, "rewards/accuracies": 1.0, "rewards/chosen": 1.237170696258545, "rewards/margins": 0.6334639191627502, "rewards/rejected": 0.6037067770957947, "step": 5907 }, { "epoch": 3.19, "learning_rate": 1.045841054476334e-08, "logits/chosen": -2.176427125930786, "logits/rejected": -2.091243028640747, "logps/chosen": -32.53902053833008, "logps/rejected": -8.539407730102539, "loss": 0.2067, "rewards/accuracies": 1.0, "rewards/chosen": 2.4875686168670654, "rewards/margins": 1.4712468385696411, "rewards/rejected": 1.0163217782974243, "step": 5908 }, { "epoch": 3.19, "learning_rate": 1.0445048821373931e-08, "logits/chosen": -2.128774881362915, "logits/rejected": -2.146188735961914, "logps/chosen": -7.111677169799805, "logps/rejected": -4.476313591003418, "loss": 0.4071, "rewards/accuracies": 1.0, "rewards/chosen": 1.3542982339859009, "rewards/margins": 0.6882279515266418, "rewards/rejected": 0.666070282459259, "step": 5909 }, { "epoch": 3.19, "learning_rate": 1.0431694643353301e-08, "logits/chosen": -2.229605197906494, "logits/rejected": -2.113710403442383, "logps/chosen": -14.982200622558594, "logps/rejected": -12.301261901855469, "loss": 0.4399, "rewards/accuracies": 1.0, "rewards/chosen": 1.795204997062683, "rewards/margins": 0.5931471586227417, "rewards/rejected": 1.2020578384399414, "step": 5910 }, { "epoch": 3.19, "learning_rate": 1.0418348013248845e-08, "logits/chosen": -2.0210070610046387, "logits/rejected": -2.0202012062072754, "logps/chosen": -0.6601815223693848, "logps/rejected": -3.6824986934661865, "loss": 0.4486, "rewards/accuracies": 1.0, "rewards/chosen": 1.040932297706604, "rewards/margins": 0.5689853429794312, "rewards/rejected": 0.47194692492485046, "step": 5911 }, { "epoch": 3.19, "learning_rate": 1.0405008933606524e-08, "logits/chosen": -2.1634199619293213, "logits/rejected": -2.272709608078003, "logps/chosen": -1.3275355100631714, "logps/rejected": -1.0456058979034424, "loss": 0.6403, "rewards/accuracies": 1.0, "rewards/chosen": 0.9828716516494751, "rewards/margins": 0.10862702131271362, "rewards/rejected": 0.8742446303367615, "step": 5912 }, { "epoch": 3.19, "learning_rate": 1.0391677406970856e-08, "logits/chosen": -2.2056283950805664, "logits/rejected": -2.089492082595825, "logps/chosen": -34.61700439453125, "logps/rejected": -2.6614818572998047, "loss": 0.1015, "rewards/accuracies": 1.0, "rewards/chosen": 2.8190934658050537, "rewards/margins": 2.2362027168273926, "rewards/rejected": 0.5828908085823059, "step": 5913 }, { "epoch": 3.19, "learning_rate": 1.0378353435884919e-08, "logits/chosen": -2.154094696044922, "logits/rejected": -2.155562162399292, "logps/chosen": -2.094071388244629, "logps/rejected": -1.5419528484344482, "loss": 0.6424, "rewards/accuracies": 1.0, "rewards/chosen": 1.046016812324524, "rewards/margins": 0.10419940948486328, "rewards/rejected": 0.9418174028396606, "step": 5914 }, { "epoch": 3.19, "learning_rate": 1.0365037022890343e-08, "logits/chosen": -2.144782781600952, "logits/rejected": -2.150662899017334, "logps/chosen": -1.2735183238983154, "logps/rejected": -4.150959014892578, "loss": 0.447, "rewards/accuracies": 1.0, "rewards/chosen": 0.9870533347129822, "rewards/margins": 0.5732671022415161, "rewards/rejected": 0.41378623247146606, "step": 5915 }, { "epoch": 3.19, "learning_rate": 1.0351728170527341e-08, "logits/chosen": -2.094433069229126, "logits/rejected": -2.2960872650146484, "logps/chosen": -0.746372401714325, "logps/rejected": -0.8283939361572266, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.9322831034660339, "rewards/margins": 0.03367042541503906, "rewards/rejected": 0.8986126780509949, "step": 5916 }, { "epoch": 3.19, "learning_rate": 1.0338426881334633e-08, "logits/chosen": -2.0510752201080322, "logits/rejected": -2.242323398590088, "logps/chosen": -0.16682878136634827, "logps/rejected": -0.21304351091384888, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.7856518626213074, "rewards/margins": 0.01512289047241211, "rewards/rejected": 0.7705289721488953, "step": 5917 }, { "epoch": 3.19, "learning_rate": 1.0325133157849535e-08, "logits/chosen": -2.1423261165618896, "logits/rejected": -2.3466110229492188, "logps/chosen": -1.207263469696045, "logps/rejected": -1.1890345811843872, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 1.0373191833496094, "rewards/margins": 0.022336602210998535, "rewards/rejected": 1.0149825811386108, "step": 5918 }, { "epoch": 3.19, "learning_rate": 1.0311847002607938e-08, "logits/chosen": -2.039098024368286, "logits/rejected": -2.045987606048584, "logps/chosen": -0.7248328328132629, "logps/rejected": -6.926646709442139, "loss": 0.3744, "rewards/accuracies": 1.0, "rewards/chosen": 0.9836624264717102, "rewards/margins": 0.7892395257949829, "rewards/rejected": 0.1944228708744049, "step": 5919 }, { "epoch": 3.19, "learning_rate": 1.0298568418144243e-08, "logits/chosen": -2.2318482398986816, "logits/rejected": -2.3783860206604004, "logps/chosen": -1.26020348072052, "logps/rejected": -1.3300113677978516, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 1.0653165578842163, "rewards/margins": 0.023500800132751465, "rewards/rejected": 1.0418157577514648, "step": 5920 }, { "epoch": 3.19, "learning_rate": 1.0285297406991433e-08, "logits/chosen": -2.0545427799224854, "logits/rejected": -2.3330066204071045, "logps/chosen": -0.3202785551548004, "logps/rejected": -2.7885212898254395, "loss": 0.5403, "rewards/accuracies": 1.0, "rewards/chosen": 0.9558976292610168, "rewards/margins": 0.3334263563156128, "rewards/rejected": 0.622471272945404, "step": 5921 }, { "epoch": 3.19, "learning_rate": 1.0272033971681043e-08, "logits/chosen": -2.021160840988159, "logits/rejected": -2.0169010162353516, "logps/chosen": -1.1330549716949463, "logps/rejected": -4.28863525390625, "loss": 0.4475, "rewards/accuracies": 1.0, "rewards/chosen": 1.1366541385650635, "rewards/margins": 0.5719290375709534, "rewards/rejected": 0.5647251009941101, "step": 5922 }, { "epoch": 3.19, "learning_rate": 1.025877811474316e-08, "logits/chosen": -2.0009536743164062, "logits/rejected": -2.26354718208313, "logps/chosen": -1.878105878829956, "logps/rejected": -1.5275428295135498, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.909500777721405, "rewards/margins": 0.01321101188659668, "rewards/rejected": 0.8962897658348083, "step": 5923 }, { "epoch": 3.2, "learning_rate": 1.024552983870643e-08, "logits/chosen": -2.168586492538452, "logits/rejected": -2.028970956802368, "logps/chosen": -28.827701568603516, "logps/rejected": -2.409430742263794, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 2.752051591873169, "rewards/margins": 2.0770621299743652, "rewards/rejected": 0.6749894022941589, "step": 5924 }, { "epoch": 3.2, "learning_rate": 1.0232289146098039e-08, "logits/chosen": -2.184481143951416, "logits/rejected": -2.195261001586914, "logps/chosen": -1.62003493309021, "logps/rejected": -2.8205647468566895, "loss": 0.4776, "rewards/accuracies": 1.0, "rewards/chosen": 1.1567806005477905, "rewards/margins": 0.49074214696884155, "rewards/rejected": 0.666038453578949, "step": 5925 }, { "epoch": 3.2, "learning_rate": 1.0219056039443747e-08, "logits/chosen": -2.0162158012390137, "logits/rejected": -2.2743186950683594, "logps/chosen": -4.121585845947266, "logps/rejected": -3.9406540393829346, "loss": 0.7059, "rewards/accuracies": 0.0, "rewards/chosen": 0.7614870071411133, "rewards/margins": -0.025427937507629395, "rewards/rejected": 0.7869149446487427, "step": 5926 }, { "epoch": 3.2, "learning_rate": 1.020583052126785e-08, "logits/chosen": -2.023529529571533, "logits/rejected": -2.2961578369140625, "logps/chosen": -0.29940110445022583, "logps/rejected": -0.33842164278030396, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 1.0317596197128296, "rewards/margins": 0.01939404010772705, "rewards/rejected": 1.0123655796051025, "step": 5927 }, { "epoch": 3.2, "learning_rate": 1.0192612594093202e-08, "logits/chosen": -2.129676342010498, "logits/rejected": -2.232823371887207, "logps/chosen": -0.31976816058158875, "logps/rejected": -0.42153558135032654, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.8430077433586121, "rewards/margins": 0.006608188152313232, "rewards/rejected": 0.8363995552062988, "step": 5928 }, { "epoch": 3.2, "learning_rate": 1.0179402260441223e-08, "logits/chosen": -2.020035743713379, "logits/rejected": -2.276236057281494, "logps/chosen": -0.78321373462677, "logps/rejected": -0.7843438386917114, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.9123194813728333, "rewards/margins": 0.013196289539337158, "rewards/rejected": 0.8991231918334961, "step": 5929 }, { "epoch": 3.2, "learning_rate": 1.0166199522831858e-08, "logits/chosen": -2.0897579193115234, "logits/rejected": -2.084792137145996, "logps/chosen": -0.09852654486894608, "logps/rejected": -12.712122917175293, "loss": 0.4639, "rewards/accuracies": 1.0, "rewards/chosen": 0.8814546465873718, "rewards/margins": 0.5272881388664246, "rewards/rejected": 0.35416650772094727, "step": 5930 }, { "epoch": 3.2, "learning_rate": 1.015300438378362e-08, "logits/chosen": -2.08868670463562, "logits/rejected": -2.273040294647217, "logps/chosen": -6.416780471801758, "logps/rejected": -2.151230812072754, "loss": 0.7287, "rewards/accuracies": 0.0, "rewards/chosen": 0.742492139339447, "rewards/margins": -0.06982982158660889, "rewards/rejected": 0.8123219609260559, "step": 5931 }, { "epoch": 3.2, "learning_rate": 1.013981684581357e-08, "logits/chosen": -2.13135027885437, "logits/rejected": -2.0447025299072266, "logps/chosen": -21.862789154052734, "logps/rejected": -2.603600263595581, "loss": 0.2557, "rewards/accuracies": 1.0, "rewards/chosen": 1.8516029119491577, "rewards/margins": 1.2332756519317627, "rewards/rejected": 0.618327260017395, "step": 5932 }, { "epoch": 3.2, "learning_rate": 1.0126636911437319e-08, "logits/chosen": -2.12223482131958, "logits/rejected": -2.128059148788452, "logps/chosen": -3.7072510719299316, "logps/rejected": -5.357699394226074, "loss": 0.5733, "rewards/accuracies": 1.0, "rewards/chosen": 0.9920642971992493, "rewards/margins": 0.25602638721466064, "rewards/rejected": 0.7360379099845886, "step": 5933 }, { "epoch": 3.2, "learning_rate": 1.0113464583169029e-08, "logits/chosen": -2.0640931129455566, "logits/rejected": -2.261516571044922, "logps/chosen": -1.3456034660339355, "logps/rejected": -1.1842900514602661, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 1.0568183660507202, "rewards/margins": 0.020414233207702637, "rewards/rejected": 1.0364041328430176, "step": 5934 }, { "epoch": 3.2, "learning_rate": 1.0100299863521405e-08, "logits/chosen": -2.0617730617523193, "logits/rejected": -2.319279670715332, "logps/chosen": -0.2780047059059143, "logps/rejected": -0.28554585576057434, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.8537665605545044, "rewards/margins": 0.013660252094268799, "rewards/rejected": 0.8401063084602356, "step": 5935 }, { "epoch": 3.2, "learning_rate": 1.0087142755005706e-08, "logits/chosen": -2.184504747390747, "logits/rejected": -2.1985602378845215, "logps/chosen": -9.769623756408691, "logps/rejected": -11.053305625915527, "loss": 0.2442, "rewards/accuracies": 1.0, "rewards/chosen": 1.9974075555801392, "rewards/margins": 1.28513503074646, "rewards/rejected": 0.712272584438324, "step": 5936 }, { "epoch": 3.2, "learning_rate": 1.0073993260131735e-08, "logits/chosen": -2.0372490882873535, "logits/rejected": -2.044379472732544, "logps/chosen": -0.8232778906822205, "logps/rejected": -4.637307643890381, "loss": 0.5038, "rewards/accuracies": 1.0, "rewards/chosen": 1.0613781213760376, "rewards/margins": 0.4230652451515198, "rewards/rejected": 0.6383128762245178, "step": 5937 }, { "epoch": 3.2, "learning_rate": 1.0060851381407847e-08, "logits/chosen": -2.1415421962738037, "logits/rejected": -2.3061563968658447, "logps/chosen": -0.7203964591026306, "logps/rejected": -2.230058193206787, "loss": 0.5797, "rewards/accuracies": 1.0, "rewards/chosen": 0.998104989528656, "rewards/margins": 0.24140900373458862, "rewards/rejected": 0.7566959857940674, "step": 5938 }, { "epoch": 3.2, "learning_rate": 1.0047717121340943e-08, "logits/chosen": -1.9837464094161987, "logits/rejected": -2.3084919452667236, "logps/chosen": -1.0962297916412354, "logps/rejected": -1.4093971252441406, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.7146488428115845, "rewards/margins": 0.03124094009399414, "rewards/rejected": 0.6834079027175903, "step": 5939 }, { "epoch": 3.2, "learning_rate": 1.0034590482436473e-08, "logits/chosen": -2.0033681392669678, "logits/rejected": -2.005889415740967, "logps/chosen": -0.844314455986023, "logps/rejected": -5.880325794219971, "loss": 0.5265, "rewards/accuracies": 1.0, "rewards/chosen": 0.8245655298233032, "rewards/margins": 0.36674848198890686, "rewards/rejected": 0.45781704783439636, "step": 5940 }, { "epoch": 3.2, "learning_rate": 1.0021471467198407e-08, "logits/chosen": -2.158062219619751, "logits/rejected": -2.0324344635009766, "logps/chosen": -20.42853546142578, "logps/rejected": -2.494579792022705, "loss": 0.2503, "rewards/accuracies": 1.0, "rewards/chosen": 1.9044231176376343, "rewards/margins": 1.2573171854019165, "rewards/rejected": 0.6471059322357178, "step": 5941 }, { "epoch": 3.2, "learning_rate": 1.0008360078129318e-08, "logits/chosen": -2.0476691722869873, "logits/rejected": -2.0521273612976074, "logps/chosen": -0.3774845600128174, "logps/rejected": -4.88355016708374, "loss": 0.4651, "rewards/accuracies": 1.0, "rewards/chosen": 0.8456806540489197, "rewards/margins": 0.5238291621208191, "rewards/rejected": 0.3218514919281006, "step": 5942 }, { "epoch": 3.21, "learning_rate": 9.995256317730282e-09, "logits/chosen": -2.090029239654541, "logits/rejected": -2.279226541519165, "logps/chosen": -0.261764258146286, "logps/rejected": -0.24568241834640503, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.9340931177139282, "rewards/margins": 0.022849202156066895, "rewards/rejected": 0.9112439155578613, "step": 5943 }, { "epoch": 3.21, "learning_rate": 9.982160188500922e-09, "logits/chosen": -2.1001968383789062, "logits/rejected": -2.099609851837158, "logps/chosen": -0.649535059928894, "logps/rejected": -5.829552173614502, "loss": 0.4977, "rewards/accuracies": 1.0, "rewards/chosen": 0.8879033327102661, "rewards/margins": 0.43855130672454834, "rewards/rejected": 0.4493520259857178, "step": 5944 }, { "epoch": 3.21, "learning_rate": 9.969071692939417e-09, "logits/chosen": -2.0424721240997314, "logits/rejected": -2.321483850479126, "logps/chosen": -3.947521209716797, "logps/rejected": -3.44954252243042, "loss": 0.7113, "rewards/accuracies": 0.0, "rewards/chosen": 0.6384096145629883, "rewards/margins": -0.036007463932037354, "rewards/rejected": 0.6744170784950256, "step": 5945 }, { "epoch": 3.21, "learning_rate": 9.955990833542472e-09, "logits/chosen": -2.082304000854492, "logits/rejected": -2.301347494125366, "logps/chosen": -0.42267709970474243, "logps/rejected": -0.46368613839149475, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 0.9484460949897766, "rewards/margins": -0.010369300842285156, "rewards/rejected": 0.9588153958320618, "step": 5946 }, { "epoch": 3.21, "learning_rate": 9.942917612805352e-09, "logits/chosen": -2.0689687728881836, "logits/rejected": -2.287940263748169, "logps/chosen": -0.31077542901039124, "logps/rejected": -0.34613144397735596, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.8294020891189575, "rewards/margins": 0.0033144354820251465, "rewards/rejected": 0.8260876536369324, "step": 5947 }, { "epoch": 3.21, "learning_rate": 9.929852033221864e-09, "logits/chosen": -2.0947680473327637, "logits/rejected": -2.2861709594726562, "logps/chosen": -0.3095470070838928, "logps/rejected": -0.3211270272731781, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9888469576835632, "rewards/margins": -1.7464160919189453e-05, "rewards/rejected": 0.9888644218444824, "step": 5948 }, { "epoch": 3.21, "learning_rate": 9.916794097284347e-09, "logits/chosen": -1.9492322206497192, "logits/rejected": -2.2782888412475586, "logps/chosen": -3.035165786743164, "logps/rejected": -1.7421714067459106, "loss": 0.7615, "rewards/accuracies": 0.0, "rewards/chosen": 0.7425174713134766, "rewards/margins": -0.13242095708847046, "rewards/rejected": 0.874938428401947, "step": 5949 }, { "epoch": 3.21, "learning_rate": 9.9037438074837e-09, "logits/chosen": -2.092116117477417, "logits/rejected": -2.298543930053711, "logps/chosen": -0.5939846634864807, "logps/rejected": -0.567173957824707, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 0.9703460931777954, "rewards/margins": 0.03198736906051636, "rewards/rejected": 0.938358724117279, "step": 5950 }, { "epoch": 3.21, "learning_rate": 9.890701166309345e-09, "logits/chosen": -2.124852418899536, "logits/rejected": -2.28554630279541, "logps/chosen": -1.9447468519210815, "logps/rejected": -0.8975449800491333, "loss": 0.6599, "rewards/accuracies": 1.0, "rewards/chosen": 0.8910304307937622, "rewards/margins": 0.06757861375808716, "rewards/rejected": 0.823451817035675, "step": 5951 }, { "epoch": 3.21, "learning_rate": 9.877666176249238e-09, "logits/chosen": -1.9787590503692627, "logits/rejected": -1.9831442832946777, "logps/chosen": -2.809641122817993, "logps/rejected": -5.088674068450928, "loss": 0.4298, "rewards/accuracies": 1.0, "rewards/chosen": 1.4097336530685425, "rewards/margins": 0.6218740344047546, "rewards/rejected": 0.7878596186637878, "step": 5952 }, { "epoch": 3.21, "learning_rate": 9.864638839789919e-09, "logits/chosen": -2.251112222671509, "logits/rejected": -2.247846841812134, "logps/chosen": -8.0394287109375, "logps/rejected": -7.621497631072998, "loss": 0.2416, "rewards/accuracies": 1.0, "rewards/chosen": 1.4081814289093018, "rewards/margins": 1.2971619367599487, "rewards/rejected": 0.11101946979761124, "step": 5953 }, { "epoch": 3.21, "learning_rate": 9.851619159416424e-09, "logits/chosen": -2.1371397972106934, "logits/rejected": -2.2838401794433594, "logps/chosen": -3.198364734649658, "logps/rejected": -3.1011781692504883, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.8235639929771423, "rewards/margins": -0.013988375663757324, "rewards/rejected": 0.8375523686408997, "step": 5954 }, { "epoch": 3.21, "learning_rate": 9.838607137612348e-09, "logits/chosen": -2.055211067199707, "logits/rejected": -2.061025857925415, "logps/chosen": -1.4624089002609253, "logps/rejected": -4.6476569175720215, "loss": 0.4286, "rewards/accuracies": 1.0, "rewards/chosen": 1.1305583715438843, "rewards/margins": 0.6253412365913391, "rewards/rejected": 0.5052171349525452, "step": 5955 }, { "epoch": 3.21, "learning_rate": 9.825602776859815e-09, "logits/chosen": -2.0142433643341064, "logits/rejected": -2.010342597961426, "logps/chosen": -3.948734760284424, "logps/rejected": -2.1363675594329834, "loss": 0.435, "rewards/accuracies": 1.0, "rewards/chosen": 1.4071159362792969, "rewards/margins": 0.6070354580879211, "rewards/rejected": 0.8000804781913757, "step": 5956 }, { "epoch": 3.21, "learning_rate": 9.81260607963949e-09, "logits/chosen": -2.0550858974456787, "logits/rejected": -2.274733304977417, "logps/chosen": -0.37726184725761414, "logps/rejected": -0.5040039420127869, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9924919009208679, "rewards/margins": -0.00010377168655395508, "rewards/rejected": 0.9925956726074219, "step": 5957 }, { "epoch": 3.21, "learning_rate": 9.799617048430587e-09, "logits/chosen": -1.9564876556396484, "logits/rejected": -2.27994441986084, "logps/chosen": -0.24246402084827423, "logps/rejected": -0.15792030096054077, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.8438395857810974, "rewards/margins": 0.011411964893341064, "rewards/rejected": 0.8324276208877563, "step": 5958 }, { "epoch": 3.21, "learning_rate": 9.786635685710842e-09, "logits/chosen": -1.9880492687225342, "logits/rejected": -1.9969842433929443, "logps/chosen": -2.789076566696167, "logps/rejected": -4.852570056915283, "loss": 0.3609, "rewards/accuracies": 1.0, "rewards/chosen": 1.4371589422225952, "rewards/margins": 0.8332752585411072, "rewards/rejected": 0.603883683681488, "step": 5959 }, { "epoch": 3.21, "learning_rate": 9.773661993956544e-09, "logits/chosen": -1.9977281093597412, "logits/rejected": -2.0436317920684814, "logps/chosen": -4.94787073135376, "logps/rejected": -8.078607559204102, "loss": 0.2828, "rewards/accuracies": 1.0, "rewards/chosen": 1.8410106897354126, "rewards/margins": 1.1182620525360107, "rewards/rejected": 0.7227486968040466, "step": 5960 }, { "epoch": 3.22, "learning_rate": 9.760695975642502e-09, "logits/chosen": -2.0818755626678467, "logits/rejected": -2.3147668838500977, "logps/chosen": -0.8633952140808105, "logps/rejected": -4.619865417480469, "loss": 0.55, "rewards/accuracies": 1.0, "rewards/chosen": 0.8613849878311157, "rewards/margins": 0.310200035572052, "rewards/rejected": 0.5511849522590637, "step": 5961 }, { "epoch": 3.22, "learning_rate": 9.747737633242092e-09, "logits/chosen": -2.0754058361053467, "logits/rejected": -2.3319270610809326, "logps/chosen": -0.13479943573474884, "logps/rejected": -0.14289993047714233, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.7763540148735046, "rewards/margins": 0.004392683506011963, "rewards/rejected": 0.7719613313674927, "step": 5962 }, { "epoch": 3.22, "learning_rate": 9.73478696922716e-09, "logits/chosen": -2.092241048812866, "logits/rejected": -2.0997936725616455, "logps/chosen": -1.592756986618042, "logps/rejected": -4.417911529541016, "loss": 0.3853, "rewards/accuracies": 1.0, "rewards/chosen": 1.2681223154067993, "rewards/margins": 0.7550350427627563, "rewards/rejected": 0.513087272644043, "step": 5963 }, { "epoch": 3.22, "learning_rate": 9.721843986068162e-09, "logits/chosen": -2.015453338623047, "logits/rejected": -2.30871844291687, "logps/chosen": -0.3500598669052124, "logps/rejected": -0.43680524826049805, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 1.0104219913482666, "rewards/margins": 0.010849058628082275, "rewards/rejected": 0.9995729327201843, "step": 5964 }, { "epoch": 3.22, "learning_rate": 9.708908686234057e-09, "logits/chosen": -2.0730926990509033, "logits/rejected": -2.063763380050659, "logps/chosen": -10.629322052001953, "logps/rejected": -4.360072612762451, "loss": 0.3493, "rewards/accuracies": 1.0, "rewards/chosen": 1.291868805885315, "rewards/margins": 0.8720189332962036, "rewards/rejected": 0.41984984278678894, "step": 5965 }, { "epoch": 3.22, "learning_rate": 9.695981072192339e-09, "logits/chosen": -2.1045174598693848, "logits/rejected": -2.385634660720825, "logps/chosen": -3.707864284515381, "logps/rejected": -11.43004322052002, "loss": 0.8317, "rewards/accuracies": 0.0, "rewards/chosen": 0.9600157141685486, "rewards/margins": -0.2602052092552185, "rewards/rejected": 1.220220923423767, "step": 5966 }, { "epoch": 3.22, "learning_rate": 9.683061146409027e-09, "logits/chosen": -2.098863124847412, "logits/rejected": -2.100294589996338, "logps/chosen": -1.0575218200683594, "logps/rejected": -2.6307969093322754, "loss": 0.4654, "rewards/accuracies": 1.0, "rewards/chosen": 1.295889139175415, "rewards/margins": 0.5232540965080261, "rewards/rejected": 0.7726350426673889, "step": 5967 }, { "epoch": 3.22, "learning_rate": 9.670148911348686e-09, "logits/chosen": -2.0274248123168945, "logits/rejected": -2.0257534980773926, "logps/chosen": -9.41703987121582, "logps/rejected": -0.9040578007698059, "loss": 0.4769, "rewards/accuracies": 1.0, "rewards/chosen": 1.37191641330719, "rewards/margins": 0.49264591932296753, "rewards/rejected": 0.8792704939842224, "step": 5968 }, { "epoch": 3.22, "learning_rate": 9.65724436947441e-09, "logits/chosen": -2.179389715194702, "logits/rejected": -2.1185879707336426, "logps/chosen": -17.899333953857422, "logps/rejected": -4.467381954193115, "loss": 0.1316, "rewards/accuracies": 1.0, "rewards/chosen": 2.611656665802002, "rewards/margins": 1.961226463317871, "rewards/rejected": 0.6504301428794861, "step": 5969 }, { "epoch": 3.22, "learning_rate": 9.644347523247832e-09, "logits/chosen": -2.0770211219787598, "logits/rejected": -2.0853679180145264, "logps/chosen": -1.1329671144485474, "logps/rejected": -12.340158462524414, "loss": 0.554, "rewards/accuracies": 1.0, "rewards/chosen": 1.1138441562652588, "rewards/margins": 0.30077725648880005, "rewards/rejected": 0.8130668997764587, "step": 5970 }, { "epoch": 3.22, "learning_rate": 9.631458375129099e-09, "logits/chosen": -1.9530376195907593, "logits/rejected": -1.9601167440414429, "logps/chosen": -1.053885817527771, "logps/rejected": -4.278895854949951, "loss": 0.4398, "rewards/accuracies": 1.0, "rewards/chosen": 0.9750964045524597, "rewards/margins": 0.5933926105499268, "rewards/rejected": 0.38170382380485535, "step": 5971 }, { "epoch": 3.22, "learning_rate": 9.61857692757691e-09, "logits/chosen": -2.2202208042144775, "logits/rejected": -2.28812575340271, "logps/chosen": -5.391928195953369, "logps/rejected": -1.3343572616577148, "loss": 0.7947, "rewards/accuracies": 0.0, "rewards/chosen": 1.0799211263656616, "rewards/margins": -0.1936955451965332, "rewards/rejected": 1.2736166715621948, "step": 5972 }, { "epoch": 3.22, "learning_rate": 9.605703183048487e-09, "logits/chosen": -2.122790813446045, "logits/rejected": -2.118943214416504, "logps/chosen": -6.928783416748047, "logps/rejected": -3.3161542415618896, "loss": 0.3467, "rewards/accuracies": 1.0, "rewards/chosen": 1.404078483581543, "rewards/margins": 0.88099604845047, "rewards/rejected": 0.523082435131073, "step": 5973 }, { "epoch": 3.22, "learning_rate": 9.592837143999578e-09, "logits/chosen": -1.9173717498779297, "logits/rejected": -1.9283599853515625, "logps/chosen": -2.5240440368652344, "logps/rejected": -5.899091720581055, "loss": 0.372, "rewards/accuracies": 1.0, "rewards/chosen": 1.221000075340271, "rewards/margins": 0.7969823479652405, "rewards/rejected": 0.4240177273750305, "step": 5974 }, { "epoch": 3.22, "learning_rate": 9.579978812884465e-09, "logits/chosen": -2.040062665939331, "logits/rejected": -2.3520524501800537, "logps/chosen": -0.1713855266571045, "logps/rejected": -0.2092127948999405, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.7664205431938171, "rewards/margins": -0.0018628835678100586, "rewards/rejected": 0.7682834267616272, "step": 5975 }, { "epoch": 3.22, "learning_rate": 9.56712819215596e-09, "logits/chosen": -2.143545627593994, "logits/rejected": -2.1426336765289307, "logps/chosen": -0.6921065449714661, "logps/rejected": -1.4716919660568237, "loss": 0.6316, "rewards/accuracies": 1.0, "rewards/chosen": 0.9194298982620239, "rewards/margins": 0.127233624458313, "rewards/rejected": 0.7921962738037109, "step": 5976 }, { "epoch": 3.22, "learning_rate": 9.554285284265406e-09, "logits/chosen": -2.089961528778076, "logits/rejected": -2.295490026473999, "logps/chosen": -2.4462549686431885, "logps/rejected": -5.676446914672852, "loss": 0.6435, "rewards/accuracies": 1.0, "rewards/chosen": 0.643732488155365, "rewards/margins": 0.10180598497390747, "rewards/rejected": 0.5419265031814575, "step": 5977 }, { "epoch": 3.22, "learning_rate": 9.541450091662678e-09, "logits/chosen": -2.2582597732543945, "logits/rejected": -2.3186190128326416, "logps/chosen": -8.03592586517334, "logps/rejected": -25.466930389404297, "loss": 0.4686, "rewards/accuracies": 1.0, "rewards/chosen": 1.1915662288665771, "rewards/margins": 0.5145108699798584, "rewards/rejected": 0.6770553588867188, "step": 5978 }, { "epoch": 3.22, "learning_rate": 9.528622616796161e-09, "logits/chosen": -2.0499143600463867, "logits/rejected": -2.0549838542938232, "logps/chosen": -1.6577786207199097, "logps/rejected": -3.4054102897644043, "loss": 0.4555, "rewards/accuracies": 1.0, "rewards/chosen": 1.0599993467330933, "rewards/margins": 0.5499434471130371, "rewards/rejected": 0.5100558996200562, "step": 5979 }, { "epoch": 3.23, "learning_rate": 9.515802862112788e-09, "logits/chosen": -2.0988662242889404, "logits/rejected": -2.3114683628082275, "logps/chosen": -3.1743719577789307, "logps/rejected": -1.1239352226257324, "loss": 0.7105, "rewards/accuracies": 0.0, "rewards/chosen": 0.7252063751220703, "rewards/margins": -0.03448379039764404, "rewards/rejected": 0.7596901655197144, "step": 5980 }, { "epoch": 3.23, "learning_rate": 9.502990830058016e-09, "logits/chosen": -2.0832974910736084, "logits/rejected": -2.31132435798645, "logps/chosen": -0.08755055069923401, "logps/rejected": -0.10180986672639847, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.862470805644989, "rewards/margins": 0.02691340446472168, "rewards/rejected": 0.8355574011802673, "step": 5981 }, { "epoch": 3.23, "learning_rate": 9.490186523075815e-09, "logits/chosen": -2.3489632606506348, "logits/rejected": -2.099466323852539, "logps/chosen": -45.73342514038086, "logps/rejected": -6.561309814453125, "loss": 0.1418, "rewards/accuracies": 1.0, "rewards/chosen": 2.2847423553466797, "rewards/margins": 1.8819071054458618, "rewards/rejected": 0.40283527970314026, "step": 5982 }, { "epoch": 3.23, "learning_rate": 9.477389943608698e-09, "logits/chosen": -2.086622476577759, "logits/rejected": -2.319798231124878, "logps/chosen": -2.0878000259399414, "logps/rejected": -2.5269675254821777, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.7295190691947937, "rewards/margins": 0.00014352798461914062, "rewards/rejected": 0.7293755412101746, "step": 5983 }, { "epoch": 3.23, "learning_rate": 9.464601094097702e-09, "logits/chosen": -2.043124198913574, "logits/rejected": -2.055445909500122, "logps/chosen": -2.3106191158294678, "logps/rejected": -2.054278612136841, "loss": 0.4715, "rewards/accuracies": 1.0, "rewards/chosen": 1.0224391222000122, "rewards/margins": 0.5068628191947937, "rewards/rejected": 0.5155763030052185, "step": 5984 }, { "epoch": 3.23, "learning_rate": 9.451819976982372e-09, "logits/chosen": -1.9726271629333496, "logits/rejected": -2.2761240005493164, "logps/chosen": -2.066695213317871, "logps/rejected": -1.8940752744674683, "loss": 0.7026, "rewards/accuracies": 0.0, "rewards/chosen": 0.706824004650116, "rewards/margins": -0.018842995166778564, "rewards/rejected": 0.7256669998168945, "step": 5985 }, { "epoch": 3.23, "learning_rate": 9.439046594700789e-09, "logits/chosen": -2.1432297229766846, "logits/rejected": -2.144430160522461, "logps/chosen": -3.145904779434204, "logps/rejected": -1.1789638996124268, "loss": 0.5411, "rewards/accuracies": 1.0, "rewards/chosen": 1.0281132459640503, "rewards/margins": 0.3315476179122925, "rewards/rejected": 0.6965656280517578, "step": 5986 }, { "epoch": 3.23, "learning_rate": 9.42628094968958e-09, "logits/chosen": -2.2214410305023193, "logits/rejected": -2.0862581729888916, "logps/chosen": -50.051353454589844, "logps/rejected": -5.105603218078613, "loss": 0.1019, "rewards/accuracies": 1.0, "rewards/chosen": 2.886164903640747, "rewards/margins": 2.232725143432617, "rewards/rejected": 0.6534397006034851, "step": 5987 }, { "epoch": 3.23, "learning_rate": 9.413523044383864e-09, "logits/chosen": -2.1943178176879883, "logits/rejected": -2.199187994003296, "logps/chosen": -2.725682258605957, "logps/rejected": -6.625855445861816, "loss": 0.3345, "rewards/accuracies": 1.0, "rewards/chosen": 1.767922282218933, "rewards/margins": 0.9233127236366272, "rewards/rejected": 0.8446095585823059, "step": 5988 }, { "epoch": 3.23, "learning_rate": 9.400772881217296e-09, "logits/chosen": -2.022538185119629, "logits/rejected": -2.05910062789917, "logps/chosen": -8.102575302124023, "logps/rejected": -16.239938735961914, "loss": 0.4388, "rewards/accuracies": 1.0, "rewards/chosen": 1.216524362564087, "rewards/margins": 0.5961704850196838, "rewards/rejected": 0.6203538775444031, "step": 5989 }, { "epoch": 3.23, "learning_rate": 9.388030462622055e-09, "logits/chosen": -1.999221920967102, "logits/rejected": -2.0029423236846924, "logps/chosen": -1.7861676216125488, "logps/rejected": -2.9322667121887207, "loss": 0.5492, "rewards/accuracies": 1.0, "rewards/chosen": 1.0355892181396484, "rewards/margins": 0.3120930790901184, "rewards/rejected": 0.72349613904953, "step": 5990 }, { "epoch": 3.23, "learning_rate": 9.375295791028842e-09, "logits/chosen": -2.0496912002563477, "logits/rejected": -2.047412872314453, "logps/chosen": -6.561138153076172, "logps/rejected": -5.793307781219482, "loss": 0.3313, "rewards/accuracies": 1.0, "rewards/chosen": 1.2485336065292358, "rewards/margins": 0.9343724250793457, "rewards/rejected": 0.31416115164756775, "step": 5991 }, { "epoch": 3.23, "learning_rate": 9.362568868866893e-09, "logits/chosen": -2.042917490005493, "logits/rejected": -2.045173168182373, "logps/chosen": -0.11237189918756485, "logps/rejected": -7.131188869476318, "loss": 0.4443, "rewards/accuracies": 1.0, "rewards/chosen": 0.7731722593307495, "rewards/margins": 0.5810249447822571, "rewards/rejected": 0.19214729964733124, "step": 5992 }, { "epoch": 3.23, "learning_rate": 9.349849698563928e-09, "logits/chosen": -1.9682093858718872, "logits/rejected": -1.9806939363479614, "logps/chosen": -1.1958340406417847, "logps/rejected": -6.935793399810791, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 1.2409553527832031, "rewards/margins": 0.6406521201133728, "rewards/rejected": 0.6003032326698303, "step": 5993 }, { "epoch": 3.23, "learning_rate": 9.337138282546225e-09, "logits/chosen": -2.1011714935302734, "logits/rejected": -2.283224582672119, "logps/chosen": -1.2452698945999146, "logps/rejected": -1.1579780578613281, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.7131255269050598, "rewards/margins": 0.0007936358451843262, "rewards/rejected": 0.7123318910598755, "step": 5994 }, { "epoch": 3.23, "learning_rate": 9.32443462323857e-09, "logits/chosen": -2.058746814727783, "logits/rejected": -2.1280481815338135, "logps/chosen": -2.5245165824890137, "logps/rejected": -25.919166564941406, "loss": 0.1551, "rewards/accuracies": 1.0, "rewards/chosen": 1.6830288171768188, "rewards/margins": 1.785386323928833, "rewards/rejected": -0.10235748440027237, "step": 5995 }, { "epoch": 3.23, "learning_rate": 9.311738723064266e-09, "logits/chosen": -2.175731658935547, "logits/rejected": -2.1798152923583984, "logps/chosen": -2.3309569358825684, "logps/rejected": -8.902596473693848, "loss": 0.4175, "rewards/accuracies": 1.0, "rewards/chosen": 1.4513708353042603, "rewards/margins": 0.6574862599372864, "rewards/rejected": 0.7938845753669739, "step": 5996 }, { "epoch": 3.23, "learning_rate": 9.299050584445134e-09, "logits/chosen": -2.101849317550659, "logits/rejected": -2.335644006729126, "logps/chosen": -5.240625381469727, "logps/rejected": -4.428808212280273, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.9644106030464172, "rewards/margins": -0.012762248516082764, "rewards/rejected": 0.9771728515625, "step": 5997 }, { "epoch": 3.24, "learning_rate": 9.286370209801541e-09, "logits/chosen": -2.1234586238861084, "logits/rejected": -2.1320643424987793, "logps/chosen": -4.057737350463867, "logps/rejected": -7.523273468017578, "loss": 0.3007, "rewards/accuracies": 1.0, "rewards/chosen": 1.5241259336471558, "rewards/margins": 1.04735267162323, "rewards/rejected": 0.4767732620239258, "step": 5998 }, { "epoch": 3.24, "learning_rate": 9.273697601552344e-09, "logits/chosen": -2.2172648906707764, "logits/rejected": -2.2162013053894043, "logps/chosen": -0.8386802673339844, "logps/rejected": -7.989666938781738, "loss": 0.3762, "rewards/accuracies": 1.0, "rewards/chosen": 1.0600441694259644, "rewards/margins": 0.7837214469909668, "rewards/rejected": 0.27632275223731995, "step": 5999 }, { "epoch": 3.24, "learning_rate": 9.261032762114923e-09, "logits/chosen": -2.0146026611328125, "logits/rejected": -2.016077995300293, "logps/chosen": -0.6455115675926208, "logps/rejected": -4.119332790374756, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 0.992050290107727, "rewards/margins": 0.4240659475326538, "rewards/rejected": 0.5679843425750732, "step": 6000 }, { "epoch": 3.24, "learning_rate": 9.248375693905181e-09, "logits/chosen": -2.092430353164673, "logits/rejected": -2.0901145935058594, "logps/chosen": -0.8376855254173279, "logps/rejected": -2.1998863220214844, "loss": 0.5575, "rewards/accuracies": 1.0, "rewards/chosen": 1.0299423933029175, "rewards/margins": 0.2925730347633362, "rewards/rejected": 0.7373693585395813, "step": 6001 }, { "epoch": 3.24, "learning_rate": 9.23572639933754e-09, "logits/chosen": -2.1777942180633545, "logits/rejected": -2.2909913063049316, "logps/chosen": -9.884722709655762, "logps/rejected": -3.321760654449463, "loss": 1.0531, "rewards/accuracies": 0.0, "rewards/chosen": 0.15953350067138672, "rewards/margins": -0.6241249442100525, "rewards/rejected": 0.7836584448814392, "step": 6002 }, { "epoch": 3.24, "learning_rate": 9.223084880824933e-09, "logits/chosen": -1.9978731870651245, "logits/rejected": -2.315502882003784, "logps/chosen": -1.1908888816833496, "logps/rejected": -0.9913545846939087, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.7188172936439514, "rewards/margins": 0.012586414813995361, "rewards/rejected": 0.706230878829956, "step": 6003 }, { "epoch": 3.24, "learning_rate": 9.210451140778819e-09, "logits/chosen": -1.9702144861221313, "logits/rejected": -2.284600257873535, "logps/chosen": -1.24951171875, "logps/rejected": -1.1978492736816406, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8970080614089966, "rewards/margins": 0.0020999908447265625, "rewards/rejected": 0.89490807056427, "step": 6004 }, { "epoch": 3.24, "learning_rate": 9.197825181609164e-09, "logits/chosen": -1.9987473487854004, "logits/rejected": -2.334441900253296, "logps/chosen": -0.3903644382953644, "logps/rejected": -0.42836087942123413, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.9153319597244263, "rewards/margins": 0.02470839023590088, "rewards/rejected": 0.8906235694885254, "step": 6005 }, { "epoch": 3.24, "learning_rate": 9.18520700572445e-09, "logits/chosen": -2.1092119216918945, "logits/rejected": -2.097414016723633, "logps/chosen": -6.599776268005371, "logps/rejected": -2.9624292850494385, "loss": 0.4456, "rewards/accuracies": 1.0, "rewards/chosen": 1.161284327507019, "rewards/margins": 0.5772406458854675, "rewards/rejected": 0.5840436816215515, "step": 6006 }, { "epoch": 3.24, "learning_rate": 9.17259661553168e-09, "logits/chosen": -2.116952657699585, "logits/rejected": -2.081807851791382, "logps/chosen": -16.025835037231445, "logps/rejected": -3.6346170902252197, "loss": 0.3376, "rewards/accuracies": 1.0, "rewards/chosen": 1.5435822010040283, "rewards/margins": 0.9122277498245239, "rewards/rejected": 0.6313544511795044, "step": 6007 }, { "epoch": 3.24, "learning_rate": 9.159994013436373e-09, "logits/chosen": -2.062338352203369, "logits/rejected": -2.0703823566436768, "logps/chosen": -1.0996390581130981, "logps/rejected": -3.6597321033477783, "loss": 0.451, "rewards/accuracies": 1.0, "rewards/chosen": 1.0755542516708374, "rewards/margins": 0.562263548374176, "rewards/rejected": 0.5132907032966614, "step": 6008 }, { "epoch": 3.24, "learning_rate": 9.147399201842554e-09, "logits/chosen": -2.215299606323242, "logits/rejected": -2.1940290927886963, "logps/chosen": -14.456578254699707, "logps/rejected": -1.4204599857330322, "loss": 0.3367, "rewards/accuracies": 1.0, "rewards/chosen": 1.6142570972442627, "rewards/margins": 0.9155016541481018, "rewards/rejected": 0.6987554430961609, "step": 6009 }, { "epoch": 3.24, "learning_rate": 9.134812183152768e-09, "logits/chosen": -2.027683973312378, "logits/rejected": -2.032947063446045, "logps/chosen": -1.519060730934143, "logps/rejected": -4.237369537353516, "loss": 0.4643, "rewards/accuracies": 1.0, "rewards/chosen": 0.9913527369499207, "rewards/margins": 0.5260798931121826, "rewards/rejected": 0.46527281403541565, "step": 6010 }, { "epoch": 3.24, "learning_rate": 9.122232959768073e-09, "logits/chosen": -2.150665044784546, "logits/rejected": -2.0608294010162354, "logps/chosen": -12.952810287475586, "logps/rejected": -5.876465320587158, "loss": 0.2102, "rewards/accuracies": 1.0, "rewards/chosen": 1.8402273654937744, "rewards/margins": 1.45282781124115, "rewards/rejected": 0.3873995244503021, "step": 6011 }, { "epoch": 3.24, "learning_rate": 9.109661534088042e-09, "logits/chosen": -2.1104042530059814, "logits/rejected": -2.1125833988189697, "logps/chosen": -2.387749671936035, "logps/rejected": -3.182650327682495, "loss": 0.5484, "rewards/accuracies": 1.0, "rewards/chosen": 1.18075692653656, "rewards/margins": 0.31412482261657715, "rewards/rejected": 0.8666321039199829, "step": 6012 }, { "epoch": 3.24, "learning_rate": 9.097097908510747e-09, "logits/chosen": -2.1110916137695312, "logits/rejected": -2.1078600883483887, "logps/chosen": -0.7869822382926941, "logps/rejected": -4.245680809020996, "loss": 0.4244, "rewards/accuracies": 1.0, "rewards/chosen": 1.1569606065750122, "rewards/margins": 0.6374489068984985, "rewards/rejected": 0.5195116996765137, "step": 6013 }, { "epoch": 3.24, "learning_rate": 9.084542085432795e-09, "logits/chosen": -2.0780396461486816, "logits/rejected": -2.0831851959228516, "logps/chosen": -0.39461782574653625, "logps/rejected": -14.184259414672852, "loss": 0.4286, "rewards/accuracies": 1.0, "rewards/chosen": 1.0516823530197144, "rewards/margins": 0.6253790855407715, "rewards/rejected": 0.42630329728126526, "step": 6014 }, { "epoch": 3.24, "learning_rate": 9.071994067249289e-09, "logits/chosen": -2.083186149597168, "logits/rejected": -2.082308292388916, "logps/chosen": -0.9160618185997009, "logps/rejected": -3.553852081298828, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 1.039049506187439, "rewards/margins": 0.27582842111587524, "rewards/rejected": 0.7632210850715637, "step": 6015 }, { "epoch": 3.24, "learning_rate": 9.059453856353844e-09, "logits/chosen": -2.0075881481170654, "logits/rejected": -2.0168893337249756, "logps/chosen": -1.1725544929504395, "logps/rejected": -3.625845193862915, "loss": 0.4431, "rewards/accuracies": 1.0, "rewards/chosen": 1.0521032810211182, "rewards/margins": 0.5841259360313416, "rewards/rejected": 0.4679773449897766, "step": 6016 }, { "epoch": 3.25, "learning_rate": 9.04692145513859e-09, "logits/chosen": -2.148540735244751, "logits/rejected": -2.152613878250122, "logps/chosen": -1.9671587944030762, "logps/rejected": -14.268548011779785, "loss": 0.2473, "rewards/accuracies": 1.0, "rewards/chosen": 1.586910605430603, "rewards/margins": 1.2711071968078613, "rewards/rejected": 0.3158034384250641, "step": 6017 }, { "epoch": 3.25, "learning_rate": 9.034396865994166e-09, "logits/chosen": -2.176403045654297, "logits/rejected": -2.318357467651367, "logps/chosen": -5.389147758483887, "logps/rejected": -5.591634750366211, "loss": 0.6677, "rewards/accuracies": 1.0, "rewards/chosen": 0.8692727088928223, "rewards/margins": 0.051554977893829346, "rewards/rejected": 0.8177177309989929, "step": 6018 }, { "epoch": 3.25, "learning_rate": 9.021880091309703e-09, "logits/chosen": -2.0259909629821777, "logits/rejected": -2.038853406906128, "logps/chosen": -25.956024169921875, "logps/rejected": -19.447086334228516, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 1.195913314819336, "rewards/margins": 0.6347600817680359, "rewards/rejected": 0.5611532330513, "step": 6019 }, { "epoch": 3.25, "learning_rate": 9.009371133472887e-09, "logits/chosen": -2.139949321746826, "logits/rejected": -2.1409003734588623, "logps/chosen": -0.12814432382583618, "logps/rejected": -6.721295356750488, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 1.0669137239456177, "rewards/margins": 0.8445543646812439, "rewards/rejected": 0.22235937416553497, "step": 6020 }, { "epoch": 3.25, "learning_rate": 8.996869994869876e-09, "logits/chosen": -1.9718495607376099, "logits/rejected": -2.293452024459839, "logps/chosen": -0.41900011897087097, "logps/rejected": -0.4732750654220581, "loss": 0.713, "rewards/accuracies": 0.0, "rewards/chosen": 1.0816112756729126, "rewards/margins": -0.039400458335876465, "rewards/rejected": 1.121011734008789, "step": 6021 }, { "epoch": 3.25, "learning_rate": 8.984376677885352e-09, "logits/chosen": -2.1204283237457275, "logits/rejected": -2.3568973541259766, "logps/chosen": -1.2793543338775635, "logps/rejected": -1.3064639568328857, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 1.083417534828186, "rewards/margins": 0.0017510652542114258, "rewards/rejected": 1.0816664695739746, "step": 6022 }, { "epoch": 3.25, "learning_rate": 8.971891184902474e-09, "logits/chosen": -2.047316789627075, "logits/rejected": -2.315157651901245, "logps/chosen": -6.7764506340026855, "logps/rejected": -6.156792640686035, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 1.0029884576797485, "rewards/margins": -0.0005068778991699219, "rewards/rejected": 1.0034953355789185, "step": 6023 }, { "epoch": 3.25, "learning_rate": 8.959413518302945e-09, "logits/chosen": -2.159514904022217, "logits/rejected": -2.1541779041290283, "logps/chosen": -2.0928399562835693, "logps/rejected": -4.525806903839111, "loss": 0.3292, "rewards/accuracies": 1.0, "rewards/chosen": 1.5976651906967163, "rewards/margins": 0.9418728947639465, "rewards/rejected": 0.6557922959327698, "step": 6024 }, { "epoch": 3.25, "learning_rate": 8.946943680466968e-09, "logits/chosen": -2.057353973388672, "logits/rejected": -2.04447603225708, "logps/chosen": -4.563048362731934, "logps/rejected": -3.6694536209106445, "loss": 0.4978, "rewards/accuracies": 1.0, "rewards/chosen": 0.898582398891449, "rewards/margins": 0.4382884204387665, "rewards/rejected": 0.4602939784526825, "step": 6025 }, { "epoch": 3.25, "learning_rate": 8.934481673773237e-09, "logits/chosen": -1.9673278331756592, "logits/rejected": -1.981799602508545, "logps/chosen": -1.5924068689346313, "logps/rejected": -4.09820032119751, "loss": 0.5025, "rewards/accuracies": 1.0, "rewards/chosen": 0.8728199005126953, "rewards/margins": 0.4262925684452057, "rewards/rejected": 0.4465273320674896, "step": 6026 }, { "epoch": 3.25, "learning_rate": 8.922027500598966e-09, "logits/chosen": -2.062408924102783, "logits/rejected": -2.288336992263794, "logps/chosen": -1.639729619026184, "logps/rejected": -1.9269137382507324, "loss": 0.5522, "rewards/accuracies": 1.0, "rewards/chosen": 1.1413638591766357, "rewards/margins": 0.3049878478050232, "rewards/rejected": 0.8363760113716125, "step": 6027 }, { "epoch": 3.25, "learning_rate": 8.909581163319874e-09, "logits/chosen": -2.0543782711029053, "logits/rejected": -2.2948286533355713, "logps/chosen": -0.38459646701812744, "logps/rejected": -0.42371824383735657, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.8525042533874512, "rewards/margins": 0.027547061443328857, "rewards/rejected": 0.8249571919441223, "step": 6028 }, { "epoch": 3.25, "learning_rate": 8.897142664310182e-09, "logits/chosen": -2.121807098388672, "logits/rejected": -2.1203718185424805, "logps/chosen": -6.700240135192871, "logps/rejected": -3.7670562267303467, "loss": 0.3687, "rewards/accuracies": 1.0, "rewards/chosen": 1.355260968208313, "rewards/margins": 0.8078333139419556, "rewards/rejected": 0.5474276542663574, "step": 6029 }, { "epoch": 3.25, "learning_rate": 8.884712005942618e-09, "logits/chosen": -2.09454083442688, "logits/rejected": -2.314319372177124, "logps/chosen": -0.3480202555656433, "logps/rejected": -0.34325867891311646, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": 0.8961132168769836, "rewards/margins": -0.018460869789123535, "rewards/rejected": 0.9145740866661072, "step": 6030 }, { "epoch": 3.25, "learning_rate": 8.87228919058839e-09, "logits/chosen": -2.102093458175659, "logits/rejected": -2.101069927215576, "logps/chosen": -2.7271347045898438, "logps/rejected": -3.3845105171203613, "loss": 0.4634, "rewards/accuracies": 1.0, "rewards/chosen": 1.203548789024353, "rewards/margins": 0.5285111665725708, "rewards/rejected": 0.6750376224517822, "step": 6031 }, { "epoch": 3.25, "learning_rate": 8.859874220617269e-09, "logits/chosen": -2.1398017406463623, "logits/rejected": -2.09633207321167, "logps/chosen": -18.62174415588379, "logps/rejected": -9.067741394042969, "loss": 0.3304, "rewards/accuracies": 1.0, "rewards/chosen": 1.639589548110962, "rewards/margins": 0.9378515481948853, "rewards/rejected": 0.7017379999160767, "step": 6032 }, { "epoch": 3.25, "learning_rate": 8.847467098397471e-09, "logits/chosen": -2.118255376815796, "logits/rejected": -2.121622323989868, "logps/chosen": -0.6131746768951416, "logps/rejected": -1.991983413696289, "loss": 0.5559, "rewards/accuracies": 1.0, "rewards/chosen": 0.9475496411323547, "rewards/margins": 0.29626721143722534, "rewards/rejected": 0.6512824296951294, "step": 6033 }, { "epoch": 3.25, "learning_rate": 8.835067826295745e-09, "logits/chosen": -2.0645387172698975, "logits/rejected": -2.295018434524536, "logps/chosen": -1.2133710384368896, "logps/rejected": -1.1498132944107056, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.7836823463439941, "rewards/margins": 0.017025530338287354, "rewards/rejected": 0.7666568160057068, "step": 6034 }, { "epoch": 3.26, "learning_rate": 8.822676406677326e-09, "logits/chosen": -2.170518159866333, "logits/rejected": -2.305758476257324, "logps/chosen": -0.1441386491060257, "logps/rejected": -0.15214821696281433, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.9599095582962036, "rewards/margins": 0.027988910675048828, "rewards/rejected": 0.9319206476211548, "step": 6035 }, { "epoch": 3.26, "learning_rate": 8.810292841905964e-09, "logits/chosen": -2.1584551334381104, "logits/rejected": -2.1638314723968506, "logps/chosen": -3.701040267944336, "logps/rejected": -10.83923053741455, "loss": 0.3042, "rewards/accuracies": 1.0, "rewards/chosen": 1.258042335510254, "rewards/margins": 1.0342373847961426, "rewards/rejected": 0.22380495071411133, "step": 6036 }, { "epoch": 3.26, "learning_rate": 8.797917134343908e-09, "logits/chosen": -2.0076236724853516, "logits/rejected": -2.292426347732544, "logps/chosen": -0.33082684874534607, "logps/rejected": -0.3055538833141327, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8113781809806824, "rewards/margins": 0.012079358100891113, "rewards/rejected": 0.7992988228797913, "step": 6037 }, { "epoch": 3.26, "learning_rate": 8.7855492863519e-09, "logits/chosen": -2.1040713787078857, "logits/rejected": -2.1091086864471436, "logps/chosen": -1.792772650718689, "logps/rejected": -3.927773952484131, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 1.3677959442138672, "rewards/margins": 0.6774149537086487, "rewards/rejected": 0.6903809905052185, "step": 6038 }, { "epoch": 3.26, "learning_rate": 8.773189300289208e-09, "logits/chosen": -2.1161389350891113, "logits/rejected": -2.2986273765563965, "logps/chosen": -0.2541084885597229, "logps/rejected": -0.24437998235225677, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9445063471794128, "rewards/margins": 0.013528525829315186, "rewards/rejected": 0.9309778213500977, "step": 6039 }, { "epoch": 3.26, "learning_rate": 8.760837178513553e-09, "logits/chosen": -2.23502254486084, "logits/rejected": -2.229645013809204, "logps/chosen": -5.393914699554443, "logps/rejected": -2.8924827575683594, "loss": 0.4181, "rewards/accuracies": 1.0, "rewards/chosen": 1.219728708267212, "rewards/margins": 0.6557486653327942, "rewards/rejected": 0.5639800429344177, "step": 6040 }, { "epoch": 3.26, "learning_rate": 8.74849292338119e-09, "logits/chosen": -2.065554618835449, "logits/rejected": -2.293137311935425, "logps/chosen": -0.23695649206638336, "logps/rejected": -0.2454095035791397, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.9123696684837341, "rewards/margins": -0.0077724456787109375, "rewards/rejected": 0.9201421141624451, "step": 6041 }, { "epoch": 3.26, "learning_rate": 8.736156537246869e-09, "logits/chosen": -2.1019093990325928, "logits/rejected": -2.293309211730957, "logps/chosen": -17.87759780883789, "logps/rejected": -14.687408447265625, "loss": 0.6431, "rewards/accuracies": 1.0, "rewards/chosen": 0.4357963502407074, "rewards/margins": 0.10281142592430115, "rewards/rejected": 0.33298492431640625, "step": 6042 }, { "epoch": 3.26, "learning_rate": 8.723828022463847e-09, "logits/chosen": -2.1368350982666016, "logits/rejected": -2.293715715408325, "logps/chosen": -2.341846227645874, "logps/rejected": -1.2205810546875, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": 1.0300121307373047, "rewards/margins": 0.06729370355606079, "rewards/rejected": 0.9627184271812439, "step": 6043 }, { "epoch": 3.26, "learning_rate": 8.711507381383871e-09, "logits/chosen": -1.999653935432434, "logits/rejected": -2.228586435317993, "logps/chosen": -0.3569992184638977, "logps/rejected": -0.388640820980072, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.9709196090698242, "rewards/margins": 0.00852668285369873, "rewards/rejected": 0.9623929262161255, "step": 6044 }, { "epoch": 3.26, "learning_rate": 8.69919461635718e-09, "logits/chosen": -1.9517815113067627, "logits/rejected": -2.2860662937164307, "logps/chosen": -0.512884795665741, "logps/rejected": -0.6090512275695801, "loss": 0.6651, "rewards/accuracies": 1.0, "rewards/chosen": 1.0249048471450806, "rewards/margins": 0.056895673274993896, "rewards/rejected": 0.9680091738700867, "step": 6045 }, { "epoch": 3.26, "learning_rate": 8.68688972973251e-09, "logits/chosen": -2.0354833602905273, "logits/rejected": -2.3355748653411865, "logps/chosen": -4.776132583618164, "logps/rejected": -4.796474456787109, "loss": 0.7035, "rewards/accuracies": 0.0, "rewards/chosen": 1.0063457489013672, "rewards/margins": -0.020604372024536133, "rewards/rejected": 1.0269501209259033, "step": 6046 }, { "epoch": 3.26, "learning_rate": 8.674592723857106e-09, "logits/chosen": -2.080317497253418, "logits/rejected": -2.0787556171417236, "logps/chosen": -0.6504859924316406, "logps/rejected": -6.409973621368408, "loss": 0.385, "rewards/accuracies": 1.0, "rewards/chosen": 1.131778597831726, "rewards/margins": 0.755733847618103, "rewards/rejected": 0.37604472041130066, "step": 6047 }, { "epoch": 3.26, "learning_rate": 8.662303601076699e-09, "logits/chosen": -1.9532887935638428, "logits/rejected": -1.9626225233078003, "logps/chosen": -0.8807985782623291, "logps/rejected": -3.115018367767334, "loss": 0.4504, "rewards/accuracies": 1.0, "rewards/chosen": 1.1166085004806519, "rewards/margins": 0.5641047358512878, "rewards/rejected": 0.552503764629364, "step": 6048 }, { "epoch": 3.26, "learning_rate": 8.65002236373552e-09, "logits/chosen": -2.0275144577026367, "logits/rejected": -2.0288305282592773, "logps/chosen": -0.4209011197090149, "logps/rejected": -5.366751670837402, "loss": 0.4595, "rewards/accuracies": 1.0, "rewards/chosen": 0.8464746475219727, "rewards/margins": 0.5390994548797607, "rewards/rejected": 0.3073751628398895, "step": 6049 }, { "epoch": 3.26, "learning_rate": 8.637749014176304e-09, "logits/chosen": -2.1394569873809814, "logits/rejected": -2.115760326385498, "logps/chosen": -10.270401000976562, "logps/rejected": -5.253230094909668, "loss": 0.442, "rewards/accuracies": 1.0, "rewards/chosen": 1.2895195484161377, "rewards/margins": 0.5873916149139404, "rewards/rejected": 0.7021279335021973, "step": 6050 }, { "epoch": 3.26, "learning_rate": 8.62548355474027e-09, "logits/chosen": -2.031806230545044, "logits/rejected": -2.037057876586914, "logps/chosen": -1.313018560409546, "logps/rejected": -5.281407833099365, "loss": 0.3967, "rewards/accuracies": 1.0, "rewards/chosen": 1.1558679342269897, "rewards/margins": 0.7198119163513184, "rewards/rejected": 0.436055988073349, "step": 6051 }, { "epoch": 3.26, "learning_rate": 8.61322598776713e-09, "logits/chosen": -2.0432677268981934, "logits/rejected": -2.337890625, "logps/chosen": -0.5374138355255127, "logps/rejected": -4.756734848022461, "loss": 0.6362, "rewards/accuracies": 1.0, "rewards/chosen": 1.0693254470825195, "rewards/margins": 0.11738336086273193, "rewards/rejected": 0.9519420862197876, "step": 6052 }, { "epoch": 3.26, "learning_rate": 8.600976315595099e-09, "logits/chosen": -2.115997552871704, "logits/rejected": -2.115224599838257, "logps/chosen": -0.8826766610145569, "logps/rejected": -3.349395751953125, "loss": 0.5488, "rewards/accuracies": 1.0, "rewards/chosen": 1.0158065557479858, "rewards/margins": 0.3131234645843506, "rewards/rejected": 0.7026830911636353, "step": 6053 }, { "epoch": 3.27, "learning_rate": 8.588734540560889e-09, "logits/chosen": -1.936238408088684, "logits/rejected": -2.20580792427063, "logps/chosen": -0.10108110308647156, "logps/rejected": -0.1136237233877182, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.8231809735298157, "rewards/margins": 0.016155123710632324, "rewards/rejected": 0.8070258498191833, "step": 6054 }, { "epoch": 3.27, "learning_rate": 8.576500664999691e-09, "logits/chosen": -2.04007625579834, "logits/rejected": -2.040921449661255, "logps/chosen": -0.9398927688598633, "logps/rejected": -4.198634147644043, "loss": 0.4603, "rewards/accuracies": 1.0, "rewards/chosen": 0.9647396206855774, "rewards/margins": 0.5369272232055664, "rewards/rejected": 0.427812397480011, "step": 6055 }, { "epoch": 3.27, "learning_rate": 8.564274691245204e-09, "logits/chosen": -2.120776653289795, "logits/rejected": -2.288256883621216, "logps/chosen": -3.211301326751709, "logps/rejected": -3.1038544178009033, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 1.0329293012619019, "rewards/margins": -0.0047577619552612305, "rewards/rejected": 1.037687063217163, "step": 6056 }, { "epoch": 3.27, "learning_rate": 8.552056621629611e-09, "logits/chosen": -2.0839788913726807, "logits/rejected": -2.3238701820373535, "logps/chosen": -0.17647331953048706, "logps/rejected": -0.17368893325328827, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.880829930305481, "rewards/margins": 0.028326988220214844, "rewards/rejected": 0.8525029420852661, "step": 6057 }, { "epoch": 3.27, "learning_rate": 8.539846458483585e-09, "logits/chosen": -2.1170432567596436, "logits/rejected": -2.120788335800171, "logps/chosen": -0.5917659401893616, "logps/rejected": -2.426112174987793, "loss": 0.4972, "rewards/accuracies": 1.0, "rewards/chosen": 1.1112247705459595, "rewards/margins": 0.4399462938308716, "rewards/rejected": 0.6712784767150879, "step": 6058 }, { "epoch": 3.27, "learning_rate": 8.527644204136308e-09, "logits/chosen": -2.133861541748047, "logits/rejected": -2.168039560317993, "logps/chosen": -1.4408669471740723, "logps/rejected": -6.1160569190979, "loss": 0.5981, "rewards/accuracies": 1.0, "rewards/chosen": 1.1434658765792847, "rewards/margins": 0.20002371072769165, "rewards/rejected": 0.943442165851593, "step": 6059 }, { "epoch": 3.27, "learning_rate": 8.515449860915425e-09, "logits/chosen": -2.1795191764831543, "logits/rejected": -2.0354740619659424, "logps/chosen": -29.777135848999023, "logps/rejected": -5.524538993835449, "loss": 0.1998, "rewards/accuracies": 1.0, "rewards/chosen": 1.8808653354644775, "rewards/margins": 1.5088247060775757, "rewards/rejected": 0.37204065918922424, "step": 6060 }, { "epoch": 3.27, "learning_rate": 8.5032634311471e-09, "logits/chosen": -2.060544013977051, "logits/rejected": -2.05976939201355, "logps/chosen": -3.0969314575195312, "logps/rejected": -6.229789733886719, "loss": 0.2815, "rewards/accuracies": 1.0, "rewards/chosen": 1.6176033020019531, "rewards/margins": 1.1236627101898193, "rewards/rejected": 0.49394065141677856, "step": 6061 }, { "epoch": 3.27, "learning_rate": 8.491084917155971e-09, "logits/chosen": -2.127026081085205, "logits/rejected": -2.360391855239868, "logps/chosen": -0.8976722359657288, "logps/rejected": -0.9043307304382324, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.8071770071983337, "rewards/margins": 0.011591911315917969, "rewards/rejected": 0.7955850958824158, "step": 6062 }, { "epoch": 3.27, "learning_rate": 8.478914321265168e-09, "logits/chosen": -2.0484588146209717, "logits/rejected": -2.285325527191162, "logps/chosen": -2.725677967071533, "logps/rejected": -3.176647186279297, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.7893704175949097, "rewards/margins": 0.03948122262954712, "rewards/rejected": 0.7498891949653625, "step": 6063 }, { "epoch": 3.27, "learning_rate": 8.466751645796305e-09, "logits/chosen": -1.9521702527999878, "logits/rejected": -1.968569040298462, "logps/chosen": -2.3017232418060303, "logps/rejected": -5.459889888763428, "loss": 0.5449, "rewards/accuracies": 1.0, "rewards/chosen": 1.1392991542816162, "rewards/margins": 0.32225489616394043, "rewards/rejected": 0.8170442581176758, "step": 6064 }, { "epoch": 3.27, "learning_rate": 8.454596893069516e-09, "logits/chosen": -2.1054093837738037, "logits/rejected": -2.33034086227417, "logps/chosen": -7.2535505294799805, "logps/rejected": -13.55494499206543, "loss": 0.7139, "rewards/accuracies": 0.0, "rewards/chosen": 0.9522141814231873, "rewards/margins": -0.04108625650405884, "rewards/rejected": 0.9933004379272461, "step": 6065 }, { "epoch": 3.27, "learning_rate": 8.442450065403384e-09, "logits/chosen": -2.0461037158966064, "logits/rejected": -2.0451982021331787, "logps/chosen": -0.24329525232315063, "logps/rejected": -6.142204284667969, "loss": 0.4269, "rewards/accuracies": 1.0, "rewards/chosen": 0.9640424847602844, "rewards/margins": 0.6302192211151123, "rewards/rejected": 0.3338232934474945, "step": 6066 }, { "epoch": 3.27, "learning_rate": 8.430311165115001e-09, "logits/chosen": -2.085114002227783, "logits/rejected": -2.287843704223633, "logps/chosen": -0.46458137035369873, "logps/rejected": -0.43383023142814636, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.8016787767410278, "rewards/margins": 0.00931161642074585, "rewards/rejected": 0.792367160320282, "step": 6067 }, { "epoch": 3.27, "learning_rate": 8.418180194519952e-09, "logits/chosen": -2.0341973304748535, "logits/rejected": -2.0249264240264893, "logps/chosen": -4.069154739379883, "logps/rejected": -5.735708713531494, "loss": 0.3847, "rewards/accuracies": 1.0, "rewards/chosen": 1.3043981790542603, "rewards/margins": 0.7568132877349854, "rewards/rejected": 0.5475848913192749, "step": 6068 }, { "epoch": 3.27, "learning_rate": 8.406057155932278e-09, "logits/chosen": -2.052363634109497, "logits/rejected": -2.054628610610962, "logps/chosen": -2.219557285308838, "logps/rejected": -1.2288126945495605, "loss": 0.4744, "rewards/accuracies": 1.0, "rewards/chosen": 1.3017702102661133, "rewards/margins": 0.49910199642181396, "rewards/rejected": 0.8026682138442993, "step": 6069 }, { "epoch": 3.27, "learning_rate": 8.393942051664538e-09, "logits/chosen": -2.0770726203918457, "logits/rejected": -2.145319700241089, "logps/chosen": -7.429125785827637, "logps/rejected": -18.367856979370117, "loss": 0.3474, "rewards/accuracies": 1.0, "rewards/chosen": 1.351523756980896, "rewards/margins": 0.8786864876747131, "rewards/rejected": 0.47283726930618286, "step": 6070 }, { "epoch": 3.27, "learning_rate": 8.381834884027772e-09, "logits/chosen": -2.100783586502075, "logits/rejected": -2.1120059490203857, "logps/chosen": -3.023681163787842, "logps/rejected": -3.524228572845459, "loss": 0.2877, "rewards/accuracies": 1.0, "rewards/chosen": 1.5536587238311768, "rewards/margins": 1.0984468460083008, "rewards/rejected": 0.455211877822876, "step": 6071 }, { "epoch": 3.28, "learning_rate": 8.369735655331506e-09, "logits/chosen": -2.172776937484741, "logits/rejected": -2.1478772163391113, "logps/chosen": -19.133981704711914, "logps/rejected": -16.829971313476562, "loss": 0.2045, "rewards/accuracies": 1.0, "rewards/chosen": 2.185790777206421, "rewards/margins": 1.4832146167755127, "rewards/rejected": 0.7025761008262634, "step": 6072 }, { "epoch": 3.28, "learning_rate": 8.357644367883737e-09, "logits/chosen": -2.1363677978515625, "logits/rejected": -2.1382157802581787, "logps/chosen": -0.182044118642807, "logps/rejected": -6.789492607116699, "loss": 0.4036, "rewards/accuracies": 1.0, "rewards/chosen": 0.9315792918205261, "rewards/margins": 0.6987696290016174, "rewards/rejected": 0.2328096479177475, "step": 6073 }, { "epoch": 3.28, "learning_rate": 8.345561023990966e-09, "logits/chosen": -2.0103797912597656, "logits/rejected": -2.0124025344848633, "logps/chosen": -0.12995457649230957, "logps/rejected": -7.215698719024658, "loss": 0.3902, "rewards/accuracies": 1.0, "rewards/chosen": 0.9532279968261719, "rewards/margins": 0.7397688627243042, "rewards/rejected": 0.21345916390419006, "step": 6074 }, { "epoch": 3.28, "learning_rate": 8.33348562595817e-09, "logits/chosen": -2.124495029449463, "logits/rejected": -2.1256635189056396, "logps/chosen": -1.5304985046386719, "logps/rejected": -2.6732301712036133, "loss": 0.5415, "rewards/accuracies": 1.0, "rewards/chosen": 1.1702598333358765, "rewards/margins": 0.3304211497306824, "rewards/rejected": 0.8398386836051941, "step": 6075 }, { "epoch": 3.28, "learning_rate": 8.321418176088796e-09, "logits/chosen": -2.013946294784546, "logits/rejected": -2.029144763946533, "logps/chosen": -3.853764295578003, "logps/rejected": -6.109818935394287, "loss": 0.5928, "rewards/accuracies": 1.0, "rewards/chosen": 0.9214440584182739, "rewards/margins": 0.2119036316871643, "rewards/rejected": 0.7095404267311096, "step": 6076 }, { "epoch": 3.28, "learning_rate": 8.309358676684813e-09, "logits/chosen": -2.087186098098755, "logits/rejected": -2.0863499641418457, "logps/chosen": -4.3346710205078125, "logps/rejected": -3.6702780723571777, "loss": 0.5746, "rewards/accuracies": 1.0, "rewards/chosen": 0.8752729296684265, "rewards/margins": 0.2530171871185303, "rewards/rejected": 0.6222557425498962, "step": 6077 }, { "epoch": 3.28, "learning_rate": 8.297307130046644e-09, "logits/chosen": -2.0396761894226074, "logits/rejected": -2.253973960876465, "logps/chosen": -0.45660659670829773, "logps/rejected": -0.3592628836631775, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9449757933616638, "rewards/margins": 0.02018827199935913, "rewards/rejected": 0.9247875213623047, "step": 6078 }, { "epoch": 3.28, "learning_rate": 8.285263538473203e-09, "logits/chosen": -2.1813344955444336, "logits/rejected": -2.1825459003448486, "logps/chosen": -0.7100799679756165, "logps/rejected": -3.124692678451538, "loss": 0.6073, "rewards/accuracies": 1.0, "rewards/chosen": 1.0022640228271484, "rewards/margins": 0.17967945337295532, "rewards/rejected": 0.8225845694541931, "step": 6079 }, { "epoch": 3.28, "learning_rate": 8.273227904261877e-09, "logits/chosen": -2.108710765838623, "logits/rejected": -2.3214457035064697, "logps/chosen": -5.507113933563232, "logps/rejected": -5.557836055755615, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 1.0721362829208374, "rewards/margins": 0.015161633491516113, "rewards/rejected": 1.0569746494293213, "step": 6080 }, { "epoch": 3.28, "learning_rate": 8.261200229708542e-09, "logits/chosen": -2.0342228412628174, "logits/rejected": -2.0390915870666504, "logps/chosen": -4.75746488571167, "logps/rejected": -1.5691286325454712, "loss": 0.596, "rewards/accuracies": 1.0, "rewards/chosen": 1.0271520614624023, "rewards/margins": 0.20485669374465942, "rewards/rejected": 0.8222953677177429, "step": 6081 }, { "epoch": 3.28, "learning_rate": 8.249180517107569e-09, "logits/chosen": -2.061995029449463, "logits/rejected": -2.284675121307373, "logps/chosen": -1.251524567604065, "logps/rejected": -1.2911841869354248, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.86646968126297, "rewards/margins": 0.0316430926322937, "rewards/rejected": 0.8348265886306763, "step": 6082 }, { "epoch": 3.28, "learning_rate": 8.237168768751784e-09, "logits/chosen": -2.0617692470550537, "logits/rejected": -2.215682029724121, "logps/chosen": -3.7086827754974365, "logps/rejected": -3.239750385284424, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 0.44387704133987427, "rewards/margins": -0.017320573329925537, "rewards/rejected": 0.4611976146697998, "step": 6083 }, { "epoch": 3.28, "learning_rate": 8.225164986932514e-09, "logits/chosen": -2.0226104259490967, "logits/rejected": -2.0316991806030273, "logps/chosen": -1.0597888231277466, "logps/rejected": -3.05536150932312, "loss": 0.3795, "rewards/accuracies": 1.0, "rewards/chosen": 1.363135576248169, "rewards/margins": 0.7732848525047302, "rewards/rejected": 0.5898507237434387, "step": 6084 }, { "epoch": 3.28, "learning_rate": 8.213169173939572e-09, "logits/chosen": -2.1281075477600098, "logits/rejected": -2.1296722888946533, "logps/chosen": -3.5094170570373535, "logps/rejected": -1.7568180561065674, "loss": 0.2487, "rewards/accuracies": 1.0, "rewards/chosen": 1.9116407632827759, "rewards/margins": 1.2645905017852783, "rewards/rejected": 0.6470502018928528, "step": 6085 }, { "epoch": 3.28, "learning_rate": 8.201181332061213e-09, "logits/chosen": -2.0284225940704346, "logits/rejected": -2.0228941440582275, "logps/chosen": -4.217608451843262, "logps/rejected": -1.672701120376587, "loss": 0.3041, "rewards/accuracies": 1.0, "rewards/chosen": 1.6602977514266968, "rewards/margins": 1.03445565700531, "rewards/rejected": 0.6258420944213867, "step": 6086 }, { "epoch": 3.28, "learning_rate": 8.189201463584195e-09, "logits/chosen": -2.134462594985962, "logits/rejected": -2.1494767665863037, "logps/chosen": -2.5594162940979004, "logps/rejected": -5.223567962646484, "loss": 0.443, "rewards/accuracies": 1.0, "rewards/chosen": 1.4174461364746094, "rewards/margins": 0.5844453573226929, "rewards/rejected": 0.8330007791519165, "step": 6087 }, { "epoch": 3.28, "learning_rate": 8.177229570793786e-09, "logits/chosen": -2.0958826541900635, "logits/rejected": -2.3165242671966553, "logps/chosen": -0.836859405040741, "logps/rejected": -0.8273698091506958, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.8684086799621582, "rewards/margins": 0.03221482038497925, "rewards/rejected": 0.836193859577179, "step": 6088 }, { "epoch": 3.28, "learning_rate": 8.165265655973697e-09, "logits/chosen": -2.031984329223633, "logits/rejected": -2.0395355224609375, "logps/chosen": -1.4881595373153687, "logps/rejected": -6.174354076385498, "loss": 0.3531, "rewards/accuracies": 1.0, "rewards/chosen": 1.1192604303359985, "rewards/margins": 0.8592216968536377, "rewards/rejected": 0.26003870368003845, "step": 6089 }, { "epoch": 3.28, "learning_rate": 8.15330972140611e-09, "logits/chosen": -2.078261613845825, "logits/rejected": -2.269087076187134, "logps/chosen": -6.8561859130859375, "logps/rejected": -6.922506332397461, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.7067691683769226, "rewards/margins": 0.0023682117462158203, "rewards/rejected": 0.7044009566307068, "step": 6090 }, { "epoch": 3.29, "learning_rate": 8.141361769371702e-09, "logits/chosen": -2.107747793197632, "logits/rejected": -2.3074252605438232, "logps/chosen": -0.9031499624252319, "logps/rejected": -0.8638973832130432, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 1.0183480978012085, "rewards/margins": -0.0017862319946289062, "rewards/rejected": 1.0201343297958374, "step": 6091 }, { "epoch": 3.29, "learning_rate": 8.129421802149632e-09, "logits/chosen": -2.1224615573883057, "logits/rejected": -2.2894911766052246, "logps/chosen": -0.17242932319641113, "logps/rejected": -0.15987062454223633, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.9110665321350098, "rewards/margins": 0.0030959248542785645, "rewards/rejected": 0.9079706072807312, "step": 6092 }, { "epoch": 3.29, "learning_rate": 8.117489822017526e-09, "logits/chosen": -2.0334935188293457, "logits/rejected": -2.222055435180664, "logps/chosen": -4.555129528045654, "logps/rejected": -1.143701434135437, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.906467616558075, "rewards/margins": -0.006988167762756348, "rewards/rejected": 0.9134557843208313, "step": 6093 }, { "epoch": 3.29, "learning_rate": 8.105565831251481e-09, "logits/chosen": -1.9894938468933105, "logits/rejected": -2.254753351211548, "logps/chosen": -1.29451322555542, "logps/rejected": -1.051595687866211, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.7907519936561584, "rewards/margins": 0.001978456974029541, "rewards/rejected": 0.7887735366821289, "step": 6094 }, { "epoch": 3.29, "learning_rate": 8.09364983212608e-09, "logits/chosen": -2.0895895957946777, "logits/rejected": -2.2527661323547363, "logps/chosen": -2.51820969581604, "logps/rejected": -3.424934148788452, "loss": 0.6633, "rewards/accuracies": 1.0, "rewards/chosen": 0.8068591952323914, "rewards/margins": 0.060549914836883545, "rewards/rejected": 0.7463092803955078, "step": 6095 }, { "epoch": 3.29, "learning_rate": 8.081741826914384e-09, "logits/chosen": -2.0418436527252197, "logits/rejected": -2.041497230529785, "logps/chosen": -0.9877528548240662, "logps/rejected": -2.86724591255188, "loss": 0.5408, "rewards/accuracies": 1.0, "rewards/chosen": 1.1193684339523315, "rewards/margins": 0.3322479724884033, "rewards/rejected": 0.7871204614639282, "step": 6096 }, { "epoch": 3.29, "learning_rate": 8.069841817887918e-09, "logits/chosen": -2.047863006591797, "logits/rejected": -2.322068929672241, "logps/chosen": -0.36286741495132446, "logps/rejected": -0.3546091914176941, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.8204259872436523, "rewards/margins": 0.024993836879730225, "rewards/rejected": 0.7954321503639221, "step": 6097 }, { "epoch": 3.29, "learning_rate": 8.057949807316694e-09, "logits/chosen": -2.037938356399536, "logits/rejected": -2.035090684890747, "logps/chosen": -2.4773125648498535, "logps/rejected": -4.521069526672363, "loss": 0.277, "rewards/accuracies": 1.0, "rewards/chosen": 1.5702850818634033, "rewards/margins": 1.1421642303466797, "rewards/rejected": 0.42812082171440125, "step": 6098 }, { "epoch": 3.29, "learning_rate": 8.046065797469182e-09, "logits/chosen": -2.20383882522583, "logits/rejected": -2.2009124755859375, "logps/chosen": -8.18934154510498, "logps/rejected": -0.5538177490234375, "loss": 0.6285, "rewards/accuracies": 1.0, "rewards/chosen": 0.8443511128425598, "rewards/margins": 0.13384556770324707, "rewards/rejected": 0.7105055451393127, "step": 6099 }, { "epoch": 3.29, "learning_rate": 8.034189790612344e-09, "logits/chosen": -2.0842955112457275, "logits/rejected": -2.0842533111572266, "logps/chosen": -1.015606164932251, "logps/rejected": -1.6326167583465576, "loss": 0.5187, "rewards/accuracies": 1.0, "rewards/chosen": 1.2635067701339722, "rewards/margins": 0.38595741987228394, "rewards/rejected": 0.8775493502616882, "step": 6100 }, { "epoch": 3.29, "learning_rate": 8.022321789011604e-09, "logits/chosen": -2.1479382514953613, "logits/rejected": -2.3190181255340576, "logps/chosen": -1.8142149448394775, "logps/rejected": -1.7993665933609009, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.7218276262283325, "rewards/margins": 0.017061293125152588, "rewards/rejected": 0.7047663331031799, "step": 6101 }, { "epoch": 3.29, "learning_rate": 8.010461794930862e-09, "logits/chosen": -2.086726665496826, "logits/rejected": -2.2507500648498535, "logps/chosen": -0.6786078214645386, "logps/rejected": -0.7047674655914307, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.8969833254814148, "rewards/margins": 0.01017904281616211, "rewards/rejected": 0.8868042826652527, "step": 6102 }, { "epoch": 3.29, "learning_rate": 7.998609810632484e-09, "logits/chosen": -1.9806458950042725, "logits/rejected": -1.9754571914672852, "logps/chosen": -3.8529772758483887, "logps/rejected": -6.005420684814453, "loss": 0.2506, "rewards/accuracies": 1.0, "rewards/chosen": 1.6941343545913696, "rewards/margins": 1.2561299800872803, "rewards/rejected": 0.43800440430641174, "step": 6103 }, { "epoch": 3.29, "learning_rate": 7.98676583837733e-09, "logits/chosen": -2.0477116107940674, "logits/rejected": -2.0565783977508545, "logps/chosen": -1.2915207147598267, "logps/rejected": -2.7551119327545166, "loss": 0.4187, "rewards/accuracies": 1.0, "rewards/chosen": 1.2473987340927124, "rewards/margins": 0.65398108959198, "rewards/rejected": 0.5934176445007324, "step": 6104 }, { "epoch": 3.29, "learning_rate": 7.974929880424702e-09, "logits/chosen": -2.07997727394104, "logits/rejected": -2.3152880668640137, "logps/chosen": -7.02318811416626, "logps/rejected": -8.59564208984375, "loss": 0.6098, "rewards/accuracies": 1.0, "rewards/chosen": 0.9845377206802368, "rewards/margins": 0.17435228824615479, "rewards/rejected": 0.810185432434082, "step": 6105 }, { "epoch": 3.29, "learning_rate": 7.963101939032391e-09, "logits/chosen": -2.1066012382507324, "logits/rejected": -2.1085124015808105, "logps/chosen": -3.0948734283447266, "logps/rejected": -0.8169339895248413, "loss": 0.6043, "rewards/accuracies": 1.0, "rewards/chosen": 1.033679485321045, "rewards/margins": 0.18639975786209106, "rewards/rejected": 0.8472797274589539, "step": 6106 }, { "epoch": 3.29, "learning_rate": 7.951282016456656e-09, "logits/chosen": -2.0283615589141846, "logits/rejected": -2.3166534900665283, "logps/chosen": -1.9738988876342773, "logps/rejected": -1.78233003616333, "loss": 0.7095, "rewards/accuracies": 0.0, "rewards/chosen": 0.6496408581733704, "rewards/margins": -0.03244739770889282, "rewards/rejected": 0.6820882558822632, "step": 6107 }, { "epoch": 3.29, "learning_rate": 7.939470114952224e-09, "logits/chosen": -2.151383399963379, "logits/rejected": -2.1444830894470215, "logps/chosen": -4.812516212463379, "logps/rejected": -7.267309188842773, "loss": 0.3454, "rewards/accuracies": 1.0, "rewards/chosen": 1.0798734426498413, "rewards/margins": 0.8852471113204956, "rewards/rejected": 0.1946263313293457, "step": 6108 }, { "epoch": 3.3, "learning_rate": 7.927666236772285e-09, "logits/chosen": -2.133786916732788, "logits/rejected": -2.30566143989563, "logps/chosen": -3.1909594535827637, "logps/rejected": -3.195194959640503, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.40562787652015686, "rewards/margins": 0.02736571431159973, "rewards/rejected": 0.37826216220855713, "step": 6109 }, { "epoch": 3.3, "learning_rate": 7.915870384168532e-09, "logits/chosen": -2.0533461570739746, "logits/rejected": -2.2586522102355957, "logps/chosen": -2.6740047931671143, "logps/rejected": -3.0497708320617676, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.8395331501960754, "rewards/margins": 0.026656627655029297, "rewards/rejected": 0.8128765225410461, "step": 6110 }, { "epoch": 3.3, "learning_rate": 7.904082559391084e-09, "logits/chosen": -2.1542813777923584, "logits/rejected": -2.130686044692993, "logps/chosen": -7.348146438598633, "logps/rejected": -5.512627601623535, "loss": 0.277, "rewards/accuracies": 1.0, "rewards/chosen": 1.5431774854660034, "rewards/margins": 1.141963243484497, "rewards/rejected": 0.40121421217918396, "step": 6111 }, { "epoch": 3.3, "learning_rate": 7.892302764688546e-09, "logits/chosen": -2.1859664916992188, "logits/rejected": -2.183332920074463, "logps/chosen": -3.298400640487671, "logps/rejected": -3.457442283630371, "loss": 0.3123, "rewards/accuracies": 1.0, "rewards/chosen": 1.562569499015808, "rewards/margins": 1.003591775894165, "rewards/rejected": 0.5589777827262878, "step": 6112 }, { "epoch": 3.3, "learning_rate": 7.880531002308e-09, "logits/chosen": -2.1848905086517334, "logits/rejected": -2.2621347904205322, "logps/chosen": -4.348010063171387, "logps/rejected": -1.8646167516708374, "loss": 0.721, "rewards/accuracies": 0.0, "rewards/chosen": 0.8610860705375671, "rewards/margins": -0.05498737096786499, "rewards/rejected": 0.9160734415054321, "step": 6113 }, { "epoch": 3.3, "learning_rate": 7.86876727449498e-09, "logits/chosen": -2.0075626373291016, "logits/rejected": -2.009063959121704, "logps/chosen": -1.7647173404693604, "logps/rejected": -4.1983418464660645, "loss": 0.5205, "rewards/accuracies": 1.0, "rewards/chosen": 0.8671116232872009, "rewards/margins": 0.3813476264476776, "rewards/rejected": 0.4857639968395233, "step": 6114 }, { "epoch": 3.3, "learning_rate": 7.857011583493517e-09, "logits/chosen": -2.2385101318359375, "logits/rejected": -2.348924398422241, "logps/chosen": -1.9515085220336914, "logps/rejected": -2.270233392715454, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.8526039123535156, "rewards/margins": 0.03438061475753784, "rewards/rejected": 0.8182232975959778, "step": 6115 }, { "epoch": 3.3, "learning_rate": 7.845263931546048e-09, "logits/chosen": -2.030690908432007, "logits/rejected": -2.313338041305542, "logps/chosen": -1.3095506429672241, "logps/rejected": -1.3958373069763184, "loss": 0.7012, "rewards/accuracies": 0.0, "rewards/chosen": 1.1356496810913086, "rewards/margins": -0.016062021255493164, "rewards/rejected": 1.1517117023468018, "step": 6116 }, { "epoch": 3.3, "learning_rate": 7.833524320893536e-09, "logits/chosen": -2.0517702102661133, "logits/rejected": -2.3126113414764404, "logps/chosen": -3.6636624336242676, "logps/rejected": -2.265523910522461, "loss": 0.718, "rewards/accuracies": 0.0, "rewards/chosen": 0.858153760433197, "rewards/margins": -0.049009501934051514, "rewards/rejected": 0.9071632623672485, "step": 6117 }, { "epoch": 3.3, "learning_rate": 7.821792753775391e-09, "logits/chosen": -2.0848805904388428, "logits/rejected": -2.2508127689361572, "logps/chosen": -2.324608087539673, "logps/rejected": -2.4522104263305664, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.7458330988883972, "rewards/margins": 0.01991879940032959, "rewards/rejected": 0.7259142994880676, "step": 6118 }, { "epoch": 3.3, "learning_rate": 7.810069232429484e-09, "logits/chosen": -2.114288568496704, "logits/rejected": -2.1514720916748047, "logps/chosen": -4.300820350646973, "logps/rejected": -14.680023193359375, "loss": 0.4316, "rewards/accuracies": 1.0, "rewards/chosen": 1.215512990951538, "rewards/margins": 0.6168004274368286, "rewards/rejected": 0.5987125635147095, "step": 6119 }, { "epoch": 3.3, "learning_rate": 7.798353759092142e-09, "logits/chosen": -2.093017339706421, "logits/rejected": -2.3202037811279297, "logps/chosen": -0.7010043263435364, "logps/rejected": -2.2773969173431396, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 0.9636231660842896, "rewards/margins": 0.14819520711898804, "rewards/rejected": 0.8154279589653015, "step": 6120 }, { "epoch": 3.3, "learning_rate": 7.786646335998192e-09, "logits/chosen": -2.1549484729766846, "logits/rejected": -2.314568042755127, "logps/chosen": -2.771449089050293, "logps/rejected": -3.225393056869507, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.940391480922699, "rewards/margins": 0.03529238700866699, "rewards/rejected": 0.905099093914032, "step": 6121 }, { "epoch": 3.3, "learning_rate": 7.774946965380896e-09, "logits/chosen": -2.183971881866455, "logits/rejected": -2.1831281185150146, "logps/chosen": -0.8839026093482971, "logps/rejected": -8.196331024169922, "loss": 0.5233, "rewards/accuracies": 1.0, "rewards/chosen": 1.2524853944778442, "rewards/margins": 0.37461215257644653, "rewards/rejected": 0.8778732419013977, "step": 6122 }, { "epoch": 3.3, "learning_rate": 7.763255649471984e-09, "logits/chosen": -2.027691602706909, "logits/rejected": -2.2518985271453857, "logps/chosen": -2.782287359237671, "logps/rejected": -2.550020694732666, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.6201385855674744, "rewards/margins": -0.0074236392974853516, "rewards/rejected": 0.6275622248649597, "step": 6123 }, { "epoch": 3.3, "learning_rate": 7.751572390501649e-09, "logits/chosen": -2.227215528488159, "logits/rejected": -2.3168087005615234, "logps/chosen": -11.331320762634277, "logps/rejected": -9.177250862121582, "loss": 0.664, "rewards/accuracies": 1.0, "rewards/chosen": 0.9002379775047302, "rewards/margins": 0.0591244101524353, "rewards/rejected": 0.8411135673522949, "step": 6124 }, { "epoch": 3.3, "learning_rate": 7.739897190698546e-09, "logits/chosen": -2.0705864429473877, "logits/rejected": -2.310586929321289, "logps/chosen": -4.245176315307617, "logps/rejected": -0.5120111703872681, "loss": 0.8093, "rewards/accuracies": 0.0, "rewards/chosen": 0.5093639492988586, "rewards/margins": -0.22025787830352783, "rewards/rejected": 0.7296218276023865, "step": 6125 }, { "epoch": 3.3, "learning_rate": 7.728230052289807e-09, "logits/chosen": -2.243809700012207, "logits/rejected": -2.2384846210479736, "logps/chosen": -0.6774270534515381, "logps/rejected": -5.1268439292907715, "loss": 0.4468, "rewards/accuracies": 1.0, "rewards/chosen": 1.094527244567871, "rewards/margins": 0.5738300681114197, "rewards/rejected": 0.5206971764564514, "step": 6126 }, { "epoch": 3.3, "learning_rate": 7.716570977501013e-09, "logits/chosen": -2.2140543460845947, "logits/rejected": -2.2127015590667725, "logps/chosen": -2.880612850189209, "logps/rejected": -6.104351043701172, "loss": 0.3041, "rewards/accuracies": 1.0, "rewards/chosen": 1.3481454849243164, "rewards/margins": 1.0345642566680908, "rewards/rejected": 0.31358128786087036, "step": 6127 }, { "epoch": 3.31, "learning_rate": 7.704919968556206e-09, "logits/chosen": -2.157074451446533, "logits/rejected": -2.2597367763519287, "logps/chosen": -0.17309272289276123, "logps/rejected": -0.18608492612838745, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.883438229560852, "rewards/margins": 0.011001884937286377, "rewards/rejected": 0.8724363446235657, "step": 6128 }, { "epoch": 3.31, "learning_rate": 7.693277027677897e-09, "logits/chosen": -2.0359551906585693, "logits/rejected": -2.404651165008545, "logps/chosen": -0.7159003615379333, "logps/rejected": -23.86286735534668, "loss": 0.9071, "rewards/accuracies": 0.0, "rewards/chosen": 1.0308068990707397, "rewards/margins": -0.3900657892227173, "rewards/rejected": 1.420872688293457, "step": 6129 }, { "epoch": 3.31, "learning_rate": 7.681642157087048e-09, "logits/chosen": -2.1230313777923584, "logits/rejected": -2.309478282928467, "logps/chosen": -1.9246798753738403, "logps/rejected": -6.22848653793335, "loss": 0.6449, "rewards/accuracies": 1.0, "rewards/chosen": 1.0377250909805298, "rewards/margins": 0.09899753332138062, "rewards/rejected": 0.9387275576591492, "step": 6130 }, { "epoch": 3.31, "learning_rate": 7.670015359003096e-09, "logits/chosen": -2.03875732421875, "logits/rejected": -2.047638416290283, "logps/chosen": -1.2468461990356445, "logps/rejected": -3.3563449382781982, "loss": 0.3197, "rewards/accuracies": 1.0, "rewards/chosen": 1.544002890586853, "rewards/margins": 0.9761185050010681, "rewards/rejected": 0.5678843855857849, "step": 6131 }, { "epoch": 3.31, "learning_rate": 7.658396635643926e-09, "logits/chosen": -2.1443822383880615, "logits/rejected": -2.1507153511047363, "logps/chosen": -2.9769484996795654, "logps/rejected": -16.440847396850586, "loss": 0.2869, "rewards/accuracies": 1.0, "rewards/chosen": 1.0906983613967896, "rewards/margins": 1.1018340587615967, "rewards/rejected": -0.011135674081742764, "step": 6132 }, { "epoch": 3.31, "learning_rate": 7.646785989225885e-09, "logits/chosen": -2.0079586505889893, "logits/rejected": -2.2860965728759766, "logps/chosen": -0.1573919951915741, "logps/rejected": -0.1796734631061554, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.8725468516349792, "rewards/margins": -0.0002906918525695801, "rewards/rejected": 0.8728375434875488, "step": 6133 }, { "epoch": 3.31, "learning_rate": 7.635183421963775e-09, "logits/chosen": -2.108865976333618, "logits/rejected": -2.2262938022613525, "logps/chosen": -0.26512378454208374, "logps/rejected": -7.099555015563965, "loss": 0.6241, "rewards/accuracies": 1.0, "rewards/chosen": 0.8955804109573364, "rewards/margins": 0.14315813779830933, "rewards/rejected": 0.7524222731590271, "step": 6134 }, { "epoch": 3.31, "learning_rate": 7.623588936070874e-09, "logits/chosen": -2.2613565921783447, "logits/rejected": -2.2734217643737793, "logps/chosen": -4.468889236450195, "logps/rejected": -5.206189155578613, "loss": 0.4092, "rewards/accuracies": 1.0, "rewards/chosen": 1.186920404434204, "rewards/margins": 0.6820703744888306, "rewards/rejected": 0.5048500299453735, "step": 6135 }, { "epoch": 3.31, "learning_rate": 7.612002533758905e-09, "logits/chosen": -2.0991556644439697, "logits/rejected": -2.085876941680908, "logps/chosen": -0.23484469950199127, "logps/rejected": -8.138375282287598, "loss": 0.3857, "rewards/accuracies": 1.0, "rewards/chosen": 1.0276843309402466, "rewards/margins": 0.7535908222198486, "rewards/rejected": 0.27409353852272034, "step": 6136 }, { "epoch": 3.31, "learning_rate": 7.600424217238043e-09, "logits/chosen": -2.14259934425354, "logits/rejected": -2.3296499252319336, "logps/chosen": -4.215106010437012, "logps/rejected": -3.9641261100769043, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.7848781943321228, "rewards/margins": -0.0007891654968261719, "rewards/rejected": 0.785667359828949, "step": 6137 }, { "epoch": 3.31, "learning_rate": 7.588853988716936e-09, "logits/chosen": -2.1733896732330322, "logits/rejected": -2.1507973670959473, "logps/chosen": -7.37862491607666, "logps/rejected": -2.442798137664795, "loss": 0.2441, "rewards/accuracies": 1.0, "rewards/chosen": 1.9681370258331299, "rewards/margins": 1.2857921123504639, "rewards/rejected": 0.6823449730873108, "step": 6138 }, { "epoch": 3.31, "learning_rate": 7.577291850402678e-09, "logits/chosen": -2.1448614597320557, "logits/rejected": -2.149000644683838, "logps/chosen": -0.4298156499862671, "logps/rejected": -5.180123805999756, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 1.1082593202590942, "rewards/margins": 0.6931843757629395, "rewards/rejected": 0.4150749742984772, "step": 6139 }, { "epoch": 3.31, "learning_rate": 7.565737804500821e-09, "logits/chosen": -1.9569380283355713, "logits/rejected": -2.3560824394226074, "logps/chosen": -5.154900550842285, "logps/rejected": -5.433063983917236, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.5136904716491699, "rewards/margins": 0.015192359685897827, "rewards/rejected": 0.4984981119632721, "step": 6140 }, { "epoch": 3.31, "learning_rate": 7.554191853215385e-09, "logits/chosen": -2.1659581661224365, "logits/rejected": -2.1629319190979004, "logps/chosen": -3.2139365673065186, "logps/rejected": -3.966773509979248, "loss": 0.5269, "rewards/accuracies": 1.0, "rewards/chosen": 0.8370718359947205, "rewards/margins": 0.3657424747943878, "rewards/rejected": 0.47132936120033264, "step": 6141 }, { "epoch": 3.31, "learning_rate": 7.542653998748827e-09, "logits/chosen": -2.047186851501465, "logits/rejected": -1.990983009338379, "logps/chosen": -28.30617332458496, "logps/rejected": -3.3646504878997803, "loss": 0.1524, "rewards/accuracies": 1.0, "rewards/chosen": 2.3466174602508545, "rewards/margins": 1.8038339614868164, "rewards/rejected": 0.5427834391593933, "step": 6142 }, { "epoch": 3.31, "learning_rate": 7.531124243302061e-09, "logits/chosen": -2.1527466773986816, "logits/rejected": -2.2875938415527344, "logps/chosen": -7.231894016265869, "logps/rejected": -4.0208892822265625, "loss": 0.7362, "rewards/accuracies": 0.0, "rewards/chosen": 0.840493381023407, "rewards/margins": -0.08439898490905762, "rewards/rejected": 0.9248923659324646, "step": 6143 }, { "epoch": 3.31, "learning_rate": 7.519602589074492e-09, "logits/chosen": -2.0735082626342773, "logits/rejected": -2.342214584350586, "logps/chosen": -0.30964043736457825, "logps/rejected": -0.26675379276275635, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.997015655040741, "rewards/margins": 0.018359899520874023, "rewards/rejected": 0.9786557555198669, "step": 6144 }, { "epoch": 3.31, "learning_rate": 7.508089038263943e-09, "logits/chosen": -1.96876859664917, "logits/rejected": -1.9722572565078735, "logps/chosen": -2.4014265537261963, "logps/rejected": -0.551138162612915, "loss": 0.6247, "rewards/accuracies": 1.0, "rewards/chosen": 1.1313122510910034, "rewards/margins": 0.14189499616622925, "rewards/rejected": 0.9894172549247742, "step": 6145 }, { "epoch": 3.31, "learning_rate": 7.496583593066685e-09, "logits/chosen": -2.0438101291656494, "logits/rejected": -2.283064126968384, "logps/chosen": -0.8196503520011902, "logps/rejected": -0.7356212139129639, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.9796454310417175, "rewards/margins": 0.0072533488273620605, "rewards/rejected": 0.9723920822143555, "step": 6146 }, { "epoch": 3.32, "learning_rate": 7.485086255677464e-09, "logits/chosen": -2.11030650138855, "logits/rejected": -2.1093804836273193, "logps/chosen": -2.9969775676727295, "logps/rejected": -4.8459792137146, "loss": 0.3358, "rewards/accuracies": 1.0, "rewards/chosen": 1.418114185333252, "rewards/margins": 0.9186786413192749, "rewards/rejected": 0.49943557381629944, "step": 6147 }, { "epoch": 3.32, "learning_rate": 7.473597028289475e-09, "logits/chosen": -2.087622880935669, "logits/rejected": -2.311964750289917, "logps/chosen": -0.2054833620786667, "logps/rejected": -0.1853937804698944, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.9384048581123352, "rewards/margins": 0.022751331329345703, "rewards/rejected": 0.9156535267829895, "step": 6148 }, { "epoch": 3.32, "learning_rate": 7.462115913094363e-09, "logits/chosen": -2.027277946472168, "logits/rejected": -2.2884809970855713, "logps/chosen": -0.7269498109817505, "logps/rejected": -0.6926652789115906, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.8634971976280212, "rewards/margins": 0.006144344806671143, "rewards/rejected": 0.8573528528213501, "step": 6149 }, { "epoch": 3.32, "learning_rate": 7.45064291228223e-09, "logits/chosen": -2.1118857860565186, "logits/rejected": -2.336282968521118, "logps/chosen": -1.8673481941223145, "logps/rejected": -1.909914255142212, "loss": 0.6666, "rewards/accuracies": 1.0, "rewards/chosen": 1.0319187641143799, "rewards/margins": 0.05372375249862671, "rewards/rejected": 0.9781950116157532, "step": 6150 }, { "epoch": 3.32, "learning_rate": 7.439178028041621e-09, "logits/chosen": -2.0408647060394287, "logits/rejected": -2.349228858947754, "logps/chosen": -6.637117862701416, "logps/rejected": -6.2577619552612305, "loss": 0.7031, "rewards/accuracies": 0.0, "rewards/chosen": 0.7197927236557007, "rewards/margins": -0.019869744777679443, "rewards/rejected": 0.7396624684333801, "step": 6151 }, { "epoch": 3.32, "learning_rate": 7.42772126255955e-09, "logits/chosen": -2.1512577533721924, "logits/rejected": -2.1475024223327637, "logps/chosen": -4.678380489349365, "logps/rejected": -7.34131383895874, "loss": 0.4352, "rewards/accuracies": 1.0, "rewards/chosen": 1.0610811710357666, "rewards/margins": 0.6065236330032349, "rewards/rejected": 0.4545575678348541, "step": 6152 }, { "epoch": 3.32, "learning_rate": 7.416272618021457e-09, "logits/chosen": -2.1144511699676514, "logits/rejected": -2.116262912750244, "logps/chosen": -0.4165741503238678, "logps/rejected": -3.626102924346924, "loss": 0.5287, "rewards/accuracies": 1.0, "rewards/chosen": 0.7850170731544495, "rewards/margins": 0.36146703362464905, "rewards/rejected": 0.4235500395298004, "step": 6153 }, { "epoch": 3.32, "learning_rate": 7.404832096611241e-09, "logits/chosen": -2.049799919128418, "logits/rejected": -2.263922691345215, "logps/chosen": -0.7799826860427856, "logps/rejected": -0.7925975918769836, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.9311191439628601, "rewards/margins": 0.04268538951873779, "rewards/rejected": 0.8884337544441223, "step": 6154 }, { "epoch": 3.32, "learning_rate": 7.3933997005112824e-09, "logits/chosen": -2.0974013805389404, "logits/rejected": -2.0946953296661377, "logps/chosen": -0.142622709274292, "logps/rejected": -7.185563087463379, "loss": 0.4771, "rewards/accuracies": 1.0, "rewards/chosen": 1.0432106256484985, "rewards/margins": 0.49201488494873047, "rewards/rejected": 0.5511957406997681, "step": 6155 }, { "epoch": 3.32, "learning_rate": 7.381975431902371e-09, "logits/chosen": -2.1624605655670166, "logits/rejected": -2.0745859146118164, "logps/chosen": -22.580467224121094, "logps/rejected": -3.4346141815185547, "loss": 0.4122, "rewards/accuracies": 1.0, "rewards/chosen": 1.607441782951355, "rewards/margins": 0.6730231046676636, "rewards/rejected": 0.9344186782836914, "step": 6156 }, { "epoch": 3.32, "learning_rate": 7.370559292963768e-09, "logits/chosen": -2.2097978591918945, "logits/rejected": -2.2164292335510254, "logps/chosen": -0.8728278875350952, "logps/rejected": -2.85284686088562, "loss": 0.4769, "rewards/accuracies": 1.0, "rewards/chosen": 0.9778211712837219, "rewards/margins": 0.4926263988018036, "rewards/rejected": 0.48519477248191833, "step": 6157 }, { "epoch": 3.32, "learning_rate": 7.359151285873172e-09, "logits/chosen": -2.0788283348083496, "logits/rejected": -2.0848867893218994, "logps/chosen": -2.22562837600708, "logps/rejected": -0.765113115310669, "loss": 0.4789, "rewards/accuracies": 1.0, "rewards/chosen": 1.336389183998108, "rewards/margins": 0.487186074256897, "rewards/rejected": 0.8492031097412109, "step": 6158 }, { "epoch": 3.32, "learning_rate": 7.347751412806735e-09, "logits/chosen": -2.1585018634796143, "logits/rejected": -2.1633951663970947, "logps/chosen": -6.889711380004883, "logps/rejected": -6.5453596115112305, "loss": 0.5824, "rewards/accuracies": 1.0, "rewards/chosen": 1.381665587425232, "rewards/margins": 0.23519563674926758, "rewards/rejected": 1.1464699506759644, "step": 6159 }, { "epoch": 3.32, "learning_rate": 7.336359675939063e-09, "logits/chosen": -2.0997838973999023, "logits/rejected": -2.2958126068115234, "logps/chosen": -4.038846492767334, "logps/rejected": -3.8122851848602295, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 1.0891882181167603, "rewards/margins": 0.00029397010803222656, "rewards/rejected": 1.088894248008728, "step": 6160 }, { "epoch": 3.32, "learning_rate": 7.324976077443201e-09, "logits/chosen": -1.9971953630447388, "logits/rejected": -2.266406774520874, "logps/chosen": -0.5619843006134033, "logps/rejected": -3.3115227222442627, "loss": 0.547, "rewards/accuracies": 1.0, "rewards/chosen": 1.0169708728790283, "rewards/margins": 0.3172610402107239, "rewards/rejected": 0.6997098326683044, "step": 6161 }, { "epoch": 3.32, "learning_rate": 7.313600619490656e-09, "logits/chosen": -2.067140817642212, "logits/rejected": -2.295813798904419, "logps/chosen": -0.16597610712051392, "logps/rejected": -0.2274819165468216, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.9056649208068848, "rewards/margins": 0.013049304485321045, "rewards/rejected": 0.8926156163215637, "step": 6162 }, { "epoch": 3.32, "learning_rate": 7.302233304251354e-09, "logits/chosen": -1.9930754899978638, "logits/rejected": -1.9881298542022705, "logps/chosen": -7.364905834197998, "logps/rejected": -3.6487808227539062, "loss": 0.2965, "rewards/accuracies": 1.0, "rewards/chosen": 1.6584819555282593, "rewards/margins": 1.063603401184082, "rewards/rejected": 0.594878613948822, "step": 6163 }, { "epoch": 3.32, "learning_rate": 7.2908741338936984e-09, "logits/chosen": -2.0816917419433594, "logits/rejected": -2.262497901916504, "logps/chosen": -0.6062509417533875, "logps/rejected": -0.7050096392631531, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.7552942633628845, "rewards/margins": 0.03365194797515869, "rewards/rejected": 0.7216423153877258, "step": 6164 }, { "epoch": 3.33, "learning_rate": 7.279523110584507e-09, "logits/chosen": -2.1603407859802246, "logits/rejected": -2.2428815364837646, "logps/chosen": -0.23127326369285583, "logps/rejected": -6.687035083770752, "loss": 0.6011, "rewards/accuracies": 1.0, "rewards/chosen": 0.9531370401382446, "rewards/margins": 0.1935042142868042, "rewards/rejected": 0.7596328258514404, "step": 6165 }, { "epoch": 3.33, "learning_rate": 7.268180236489091e-09, "logits/chosen": -2.1077044010162354, "logits/rejected": -2.1137197017669678, "logps/chosen": -3.4227001667022705, "logps/rejected": -4.024734020233154, "loss": 0.5028, "rewards/accuracies": 1.0, "rewards/chosen": 1.1508616209030151, "rewards/margins": 0.4257124066352844, "rewards/rejected": 0.7251492142677307, "step": 6166 }, { "epoch": 3.33, "learning_rate": 7.2568455137711705e-09, "logits/chosen": -2.0195865631103516, "logits/rejected": -2.007020950317383, "logps/chosen": -5.518650531768799, "logps/rejected": -6.089094161987305, "loss": 0.3069, "rewards/accuracies": 1.0, "rewards/chosen": 1.5831292867660522, "rewards/margins": 1.0237667560577393, "rewards/rejected": 0.5593625903129578, "step": 6167 }, { "epoch": 3.33, "learning_rate": 7.24551894459291e-09, "logits/chosen": -2.039280652999878, "logits/rejected": -2.0403966903686523, "logps/chosen": -2.616778612136841, "logps/rejected": -1.2038778066635132, "loss": 0.4586, "rewards/accuracies": 1.0, "rewards/chosen": 1.4834465980529785, "rewards/margins": 0.5415741801261902, "rewards/rejected": 0.9418724179267883, "step": 6168 }, { "epoch": 3.33, "learning_rate": 7.234200531114931e-09, "logits/chosen": -2.055919647216797, "logits/rejected": -2.0604753494262695, "logps/chosen": -2.615969657897949, "logps/rejected": -2.966762065887451, "loss": 0.3807, "rewards/accuracies": 1.0, "rewards/chosen": 1.223992943763733, "rewards/margins": 0.7693390846252441, "rewards/rejected": 0.45465388894081116, "step": 6169 }, { "epoch": 3.33, "learning_rate": 7.2228902754962965e-09, "logits/chosen": -2.041907787322998, "logits/rejected": -2.0355794429779053, "logps/chosen": -3.310849189758301, "logps/rejected": -3.413658618927002, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": 1.6474484205245972, "rewards/margins": 0.9855314493179321, "rewards/rejected": 0.661916971206665, "step": 6170 }, { "epoch": 3.33, "learning_rate": 7.211588179894512e-09, "logits/chosen": -2.0438504219055176, "logits/rejected": -2.032742500305176, "logps/chosen": -5.525899887084961, "logps/rejected": -8.008137702941895, "loss": 0.3033, "rewards/accuracies": 1.0, "rewards/chosen": 1.5858831405639648, "rewards/margins": 1.0377373695373535, "rewards/rejected": 0.5481457710266113, "step": 6171 }, { "epoch": 3.33, "learning_rate": 7.200294246465533e-09, "logits/chosen": -2.158015251159668, "logits/rejected": -2.2073028087615967, "logps/chosen": -5.259210586547852, "logps/rejected": -21.37541961669922, "loss": 0.4557, "rewards/accuracies": 1.0, "rewards/chosen": 1.3550697565078735, "rewards/margins": 0.5495437979698181, "rewards/rejected": 0.8055259585380554, "step": 6172 }, { "epoch": 3.33, "learning_rate": 7.189008477363745e-09, "logits/chosen": -2.210587739944458, "logits/rejected": -2.2199106216430664, "logps/chosen": -2.9655747413635254, "logps/rejected": -7.516932487487793, "loss": 0.2638, "rewards/accuracies": 1.0, "rewards/chosen": 1.3190548419952393, "rewards/margins": 1.1979295015335083, "rewards/rejected": 0.12112531810998917, "step": 6173 }, { "epoch": 3.33, "learning_rate": 7.177730874741983e-09, "logits/chosen": -2.1242616176605225, "logits/rejected": -2.1276512145996094, "logps/chosen": -3.2884230613708496, "logps/rejected": -0.2822961211204529, "loss": 0.5733, "rewards/accuracies": 1.0, "rewards/chosen": 1.2988481521606445, "rewards/margins": 0.2559530735015869, "rewards/rejected": 1.0428950786590576, "step": 6174 }, { "epoch": 3.33, "learning_rate": 7.166461440751525e-09, "logits/chosen": -2.0901036262512207, "logits/rejected": -2.085132360458374, "logps/chosen": -13.659226417541504, "logps/rejected": -11.889368057250977, "loss": 0.7285, "rewards/accuracies": 0.0, "rewards/chosen": 0.8336995244026184, "rewards/margins": -0.06944817304611206, "rewards/rejected": 0.9031476974487305, "step": 6175 }, { "epoch": 3.33, "learning_rate": 7.155200177542098e-09, "logits/chosen": -2.073359489440918, "logits/rejected": -2.0854437351226807, "logps/chosen": -2.7804274559020996, "logps/rejected": -1.1459474563598633, "loss": 0.4428, "rewards/accuracies": 1.0, "rewards/chosen": 1.3424558639526367, "rewards/margins": 0.5850329995155334, "rewards/rejected": 0.7574228644371033, "step": 6176 }, { "epoch": 3.33, "learning_rate": 7.1439470872618535e-09, "logits/chosen": -2.101834774017334, "logits/rejected": -2.1229660511016846, "logps/chosen": -1.5551278591156006, "logps/rejected": -7.171154022216797, "loss": 0.5173, "rewards/accuracies": 1.0, "rewards/chosen": 1.1334283351898193, "rewards/margins": 0.38925468921661377, "rewards/rejected": 0.7441736459732056, "step": 6177 }, { "epoch": 3.33, "learning_rate": 7.1327021720573925e-09, "logits/chosen": -2.0968031883239746, "logits/rejected": -2.292691946029663, "logps/chosen": -3.1517043113708496, "logps/rejected": -7.11411190032959, "loss": 0.7472, "rewards/accuracies": 0.0, "rewards/chosen": 1.0480722188949585, "rewards/margins": -0.10541260242462158, "rewards/rejected": 1.15348482131958, "step": 6178 }, { "epoch": 3.33, "learning_rate": 7.121465434073765e-09, "logits/chosen": -2.1350016593933105, "logits/rejected": -2.347344160079956, "logps/chosen": -1.171980857849121, "logps/rejected": -1.2021760940551758, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.9459190368652344, "rewards/margins": 0.0316777229309082, "rewards/rejected": 0.9142413139343262, "step": 6179 }, { "epoch": 3.33, "learning_rate": 7.110236875454445e-09, "logits/chosen": -2.0608112812042236, "logits/rejected": -2.052530527114868, "logps/chosen": -3.27439546585083, "logps/rejected": -7.8894429206848145, "loss": 0.2521, "rewards/accuracies": 1.0, "rewards/chosen": 1.6372673511505127, "rewards/margins": 1.2490365505218506, "rewards/rejected": 0.3882308602333069, "step": 6180 }, { "epoch": 3.33, "learning_rate": 7.099016498341359e-09, "logits/chosen": -1.9680041074752808, "logits/rejected": -1.93707275390625, "logps/chosen": -8.569364547729492, "logps/rejected": -3.826639175415039, "loss": 0.2427, "rewards/accuracies": 1.0, "rewards/chosen": 1.7618407011032104, "rewards/margins": 1.29196035861969, "rewards/rejected": 0.4698803126811981, "step": 6181 }, { "epoch": 3.33, "learning_rate": 7.0878043048748615e-09, "logits/chosen": -2.081336259841919, "logits/rejected": -2.0691540241241455, "logps/chosen": -12.002975463867188, "logps/rejected": -4.049481391906738, "loss": 0.304, "rewards/accuracies": 1.0, "rewards/chosen": 1.4893463850021362, "rewards/margins": 1.0347261428833008, "rewards/rejected": 0.45462027192115784, "step": 6182 }, { "epoch": 3.33, "learning_rate": 7.0766002971937635e-09, "logits/chosen": -2.0290210247039795, "logits/rejected": -2.035299062728882, "logps/chosen": -4.155140399932861, "logps/rejected": -8.008914947509766, "loss": 0.3751, "rewards/accuracies": 1.0, "rewards/chosen": 0.899666965007782, "rewards/margins": 0.787091076374054, "rewards/rejected": 0.11257591098546982, "step": 6183 }, { "epoch": 3.34, "learning_rate": 7.065404477435299e-09, "logits/chosen": -2.0725841522216797, "logits/rejected": -2.073219060897827, "logps/chosen": -1.018157958984375, "logps/rejected": -1.2266464233398438, "loss": 0.3394, "rewards/accuracies": 1.0, "rewards/chosen": 1.5969810485839844, "rewards/margins": 0.9060815572738647, "rewards/rejected": 0.6908994913101196, "step": 6184 }, { "epoch": 3.34, "learning_rate": 7.054216847735145e-09, "logits/chosen": -2.0909788608551025, "logits/rejected": -2.0825047492980957, "logps/chosen": -5.893669605255127, "logps/rejected": -4.695562362670898, "loss": 0.405, "rewards/accuracies": 1.0, "rewards/chosen": 1.2736643552780151, "rewards/margins": 0.6945410370826721, "rewards/rejected": 0.579123318195343, "step": 6185 }, { "epoch": 3.34, "learning_rate": 7.043037410227409e-09, "logits/chosen": -2.1652791500091553, "logits/rejected": -2.047276496887207, "logps/chosen": -8.924051284790039, "logps/rejected": -2.103761911392212, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 1.7954250574111938, "rewards/margins": 0.9919126629829407, "rewards/rejected": 0.8035123944282532, "step": 6186 }, { "epoch": 3.34, "learning_rate": 7.031866167044653e-09, "logits/chosen": -2.01997447013855, "logits/rejected": -2.241471767425537, "logps/chosen": -0.9158185720443726, "logps/rejected": -0.9084084033966064, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 1.0110119581222534, "rewards/margins": -0.0034542083740234375, "rewards/rejected": 1.0144661664962769, "step": 6187 }, { "epoch": 3.34, "learning_rate": 7.020703120317845e-09, "logits/chosen": -2.048530101776123, "logits/rejected": -2.30002760887146, "logps/chosen": -0.8372689485549927, "logps/rejected": -0.8022411465644836, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.8961796760559082, "rewards/margins": 0.010127604007720947, "rewards/rejected": 0.8860520720481873, "step": 6188 }, { "epoch": 3.34, "learning_rate": 7.009548272176441e-09, "logits/chosen": -2.2573835849761963, "logits/rejected": -2.2560136318206787, "logps/chosen": -2.6851532459259033, "logps/rejected": -5.783719539642334, "loss": 0.4313, "rewards/accuracies": 1.0, "rewards/chosen": 0.9545083045959473, "rewards/margins": 0.6175745725631714, "rewards/rejected": 0.33693376183509827, "step": 6189 }, { "epoch": 3.34, "learning_rate": 6.99840162474829e-09, "logits/chosen": -2.1072845458984375, "logits/rejected": -2.116999387741089, "logps/chosen": -0.3274155259132385, "logps/rejected": -12.975058555603027, "loss": 0.5197, "rewards/accuracies": 1.0, "rewards/chosen": 1.0447314977645874, "rewards/margins": 0.38337159156799316, "rewards/rejected": 0.6613599061965942, "step": 6190 }, { "epoch": 3.34, "learning_rate": 6.987263180159691e-09, "logits/chosen": -2.1341865062713623, "logits/rejected": -2.3626933097839355, "logps/chosen": -9.985101699829102, "logps/rejected": -8.288678169250488, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 1.476779580116272, "rewards/margins": 0.5305916666984558, "rewards/rejected": 0.9461879134178162, "step": 6191 }, { "epoch": 3.34, "learning_rate": 6.976132940535362e-09, "logits/chosen": -2.1789212226867676, "logits/rejected": -2.180408239364624, "logps/chosen": -3.9204773902893066, "logps/rejected": -10.316701889038086, "loss": 0.1935, "rewards/accuracies": 1.0, "rewards/chosen": 1.957261323928833, "rewards/margins": 1.5439133644104004, "rewards/rejected": 0.4133480191230774, "step": 6192 }, { "epoch": 3.34, "learning_rate": 6.965010907998481e-09, "logits/chosen": -2.2447404861450195, "logits/rejected": -2.089756727218628, "logps/chosen": -32.48394775390625, "logps/rejected": -1.551634669303894, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": 3.0109708309173584, "rewards/margins": 2.0412638187408447, "rewards/rejected": 0.9697069525718689, "step": 6193 }, { "epoch": 3.34, "learning_rate": 6.953897084670646e-09, "logits/chosen": -2.17071533203125, "logits/rejected": -2.137641191482544, "logps/chosen": -24.687410354614258, "logps/rejected": -5.564334869384766, "loss": 0.4668, "rewards/accuracies": 1.0, "rewards/chosen": 1.5447981357574463, "rewards/margins": 0.5193420648574829, "rewards/rejected": 1.0254560708999634, "step": 6194 }, { "epoch": 3.34, "learning_rate": 6.942791472671894e-09, "logits/chosen": -2.096322774887085, "logits/rejected": -2.06619930267334, "logps/chosen": -5.026749134063721, "logps/rejected": -4.990520477294922, "loss": 0.3916, "rewards/accuracies": 1.0, "rewards/chosen": 1.2957584857940674, "rewards/margins": 0.7352352738380432, "rewards/rejected": 0.5605232119560242, "step": 6195 }, { "epoch": 3.34, "learning_rate": 6.931694074120697e-09, "logits/chosen": -2.2313356399536133, "logits/rejected": -2.226224422454834, "logps/chosen": -0.10383611172437668, "logps/rejected": -13.014833450317383, "loss": 0.301, "rewards/accuracies": 1.0, "rewards/chosen": 0.9483535885810852, "rewards/margins": 1.0463027954101562, "rewards/rejected": -0.09794922173023224, "step": 6196 }, { "epoch": 3.34, "learning_rate": 6.920604891133947e-09, "logits/chosen": -2.0918078422546387, "logits/rejected": -2.091071605682373, "logps/chosen": -4.644891262054443, "logps/rejected": -6.3071184158325195, "loss": 0.4664, "rewards/accuracies": 1.0, "rewards/chosen": 1.0670639276504517, "rewards/margins": 0.520333468914032, "rewards/rejected": 0.5467304587364197, "step": 6197 }, { "epoch": 3.34, "learning_rate": 6.909523925826993e-09, "logits/chosen": -2.1902482509613037, "logits/rejected": -2.2772908210754395, "logps/chosen": -1.4910862445831299, "logps/rejected": -1.5131046772003174, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.8968454599380493, "rewards/margins": -0.0031748414039611816, "rewards/rejected": 0.9000203013420105, "step": 6198 }, { "epoch": 3.34, "learning_rate": 6.89845118031358e-09, "logits/chosen": -2.134507417678833, "logits/rejected": -2.3553237915039062, "logps/chosen": -9.477066993713379, "logps/rejected": -9.047442436218262, "loss": 0.7236, "rewards/accuracies": 0.0, "rewards/chosen": 0.8101357817649841, "rewards/margins": -0.059914350509643555, "rewards/rejected": 0.8700501322746277, "step": 6199 }, { "epoch": 3.34, "learning_rate": 6.887386656705935e-09, "logits/chosen": -2.055752992630005, "logits/rejected": -2.055574655532837, "logps/chosen": -0.2704990804195404, "logps/rejected": -5.639641284942627, "loss": 0.4461, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427329897880554, "rewards/margins": 0.5759508609771729, "rewards/rejected": 0.36678215861320496, "step": 6200 }, { "epoch": 3.34, "learning_rate": 6.876330357114673e-09, "logits/chosen": -2.1158759593963623, "logits/rejected": -2.2897911071777344, "logps/chosen": -0.4337981939315796, "logps/rejected": -0.5785303711891174, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 1.0468629598617554, "rewards/margins": 0.012178778648376465, "rewards/rejected": 1.034684181213379, "step": 6201 }, { "epoch": 3.35, "learning_rate": 6.865282283648866e-09, "logits/chosen": -2.1369874477386475, "logits/rejected": -2.1416261196136475, "logps/chosen": -10.292038917541504, "logps/rejected": -7.905355930328369, "loss": 0.1754, "rewards/accuracies": 1.0, "rewards/chosen": 2.3714191913604736, "rewards/margins": 1.6516284942626953, "rewards/rejected": 0.7197906374931335, "step": 6202 }, { "epoch": 3.35, "learning_rate": 6.854242438416002e-09, "logits/chosen": -2.069697380065918, "logits/rejected": -2.33941388130188, "logps/chosen": -1.5554475784301758, "logps/rejected": -5.085502624511719, "loss": 0.6316, "rewards/accuracies": 1.0, "rewards/chosen": 1.0435930490493774, "rewards/margins": 0.12715643644332886, "rewards/rejected": 0.9164366126060486, "step": 6203 }, { "epoch": 3.35, "learning_rate": 6.843210823522e-09, "logits/chosen": -2.2184009552001953, "logits/rejected": -2.2204480171203613, "logps/chosen": -3.187317132949829, "logps/rejected": -0.9473810195922852, "loss": 0.5845, "rewards/accuracies": 1.0, "rewards/chosen": 1.1633186340332031, "rewards/margins": 0.23061835765838623, "rewards/rejected": 0.9327002763748169, "step": 6204 }, { "epoch": 3.35, "learning_rate": 6.8321874410712245e-09, "logits/chosen": -2.1755154132843018, "logits/rejected": -2.082423686981201, "logps/chosen": -15.600113868713379, "logps/rejected": -1.4526184797286987, "loss": 0.2566, "rewards/accuracies": 1.0, "rewards/chosen": 2.082881212234497, "rewards/margins": 1.228980302810669, "rewards/rejected": 0.8539008498191833, "step": 6205 }, { "epoch": 3.35, "learning_rate": 6.821172293166444e-09, "logits/chosen": -2.076439380645752, "logits/rejected": -2.0697009563446045, "logps/chosen": -3.4041905403137207, "logps/rejected": -3.9931228160858154, "loss": 0.4709, "rewards/accuracies": 1.0, "rewards/chosen": 0.979156494140625, "rewards/margins": 0.5083729028701782, "rewards/rejected": 0.47078362107276917, "step": 6206 }, { "epoch": 3.35, "learning_rate": 6.8101653819088834e-09, "logits/chosen": -2.2106359004974365, "logits/rejected": -2.157726526260376, "logps/chosen": -14.298678398132324, "logps/rejected": -4.515523433685303, "loss": 0.2205, "rewards/accuracies": 1.0, "rewards/chosen": 1.897958755493164, "rewards/margins": 1.399705410003662, "rewards/rejected": 0.49825340509414673, "step": 6207 }, { "epoch": 3.35, "learning_rate": 6.799166709398174e-09, "logits/chosen": -2.1474077701568604, "logits/rejected": -2.1560490131378174, "logps/chosen": -1.331977128982544, "logps/rejected": -4.493235111236572, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": 1.3653219938278198, "rewards/margins": 0.8158907294273376, "rewards/rejected": 0.5494312644004822, "step": 6208 }, { "epoch": 3.35, "learning_rate": 6.7881762777324105e-09, "logits/chosen": -2.068502187728882, "logits/rejected": -2.0538315773010254, "logps/chosen": -13.112035751342773, "logps/rejected": -4.737102508544922, "loss": 0.5007, "rewards/accuracies": 1.0, "rewards/chosen": 1.4709606170654297, "rewards/margins": 0.4309424161911011, "rewards/rejected": 1.0400182008743286, "step": 6209 }, { "epoch": 3.35, "learning_rate": 6.777194089008043e-09, "logits/chosen": -2.220090389251709, "logits/rejected": -2.153022289276123, "logps/chosen": -20.841894149780273, "logps/rejected": -1.8087164163589478, "loss": 0.168, "rewards/accuracies": 1.0, "rewards/chosen": 2.34370493888855, "rewards/margins": 1.698516607284546, "rewards/rejected": 0.6451883912086487, "step": 6210 }, { "epoch": 3.35, "learning_rate": 6.766220145320034e-09, "logits/chosen": -2.245695114135742, "logits/rejected": -2.263862371444702, "logps/chosen": -1.918890357017517, "logps/rejected": -9.064618110656738, "loss": 0.3081, "rewards/accuracies": 1.0, "rewards/chosen": 1.4687780141830444, "rewards/margins": 1.0192129611968994, "rewards/rejected": 0.44956502318382263, "step": 6211 }, { "epoch": 3.35, "learning_rate": 6.755254448761727e-09, "logits/chosen": -2.089599609375, "logits/rejected": -2.0965397357940674, "logps/chosen": -2.9862453937530518, "logps/rejected": -6.630700588226318, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 0.9924155473709106, "rewards/margins": 0.6528847217559814, "rewards/rejected": 0.3395307958126068, "step": 6212 }, { "epoch": 3.35, "learning_rate": 6.744297001424903e-09, "logits/chosen": -1.990368366241455, "logits/rejected": -2.2605748176574707, "logps/chosen": -0.35892006754875183, "logps/rejected": -0.42180439829826355, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 1.0123251676559448, "rewards/margins": 0.023843705654144287, "rewards/rejected": 0.9884814620018005, "step": 6213 }, { "epoch": 3.35, "learning_rate": 6.733347805399764e-09, "logits/chosen": -2.0464792251586914, "logits/rejected": -2.358293294906616, "logps/chosen": -10.387285232543945, "logps/rejected": -6.690404891967773, "loss": 0.613, "rewards/accuracies": 1.0, "rewards/chosen": 1.0433428287506104, "rewards/margins": 0.1673051118850708, "rewards/rejected": 0.8760377168655396, "step": 6214 }, { "epoch": 3.35, "learning_rate": 6.722406862774943e-09, "logits/chosen": -2.012254238128662, "logits/rejected": -2.3251969814300537, "logps/chosen": -0.1283467561006546, "logps/rejected": -0.14650481939315796, "loss": 0.7015, "rewards/accuracies": 0.0, "rewards/chosen": 0.998586118221283, "rewards/margins": -0.016571104526519775, "rewards/rejected": 1.0151572227478027, "step": 6215 }, { "epoch": 3.35, "learning_rate": 6.711474175637494e-09, "logits/chosen": -1.9967129230499268, "logits/rejected": -1.9975069761276245, "logps/chosen": -3.168130874633789, "logps/rejected": -3.7712347507476807, "loss": 0.2454, "rewards/accuracies": 1.0, "rewards/chosen": 1.789886474609375, "rewards/margins": 1.2797906398773193, "rewards/rejected": 0.5100957751274109, "step": 6216 }, { "epoch": 3.35, "learning_rate": 6.700549746072903e-09, "logits/chosen": -2.1743276119232178, "logits/rejected": -2.3316893577575684, "logps/chosen": -2.7491955757141113, "logps/rejected": -2.514035701751709, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.5588891506195068, "rewards/margins": 0.015805065631866455, "rewards/rejected": 0.5430840849876404, "step": 6217 }, { "epoch": 3.35, "learning_rate": 6.689633576165082e-09, "logits/chosen": -2.0315377712249756, "logits/rejected": -2.272825241088867, "logps/chosen": -0.5314555764198303, "logps/rejected": -0.6942019462585449, "loss": 0.6697, "rewards/accuracies": 1.0, "rewards/chosen": 0.8167096376419067, "rewards/margins": 0.047367751598358154, "rewards/rejected": 0.7693418860435486, "step": 6218 }, { "epoch": 3.35, "learning_rate": 6.678725667996349e-09, "logits/chosen": -2.1659438610076904, "logits/rejected": -2.326687812805176, "logps/chosen": -1.556274175643921, "logps/rejected": -0.6090826392173767, "loss": 0.7066, "rewards/accuracies": 0.0, "rewards/chosen": 0.9434114694595337, "rewards/margins": -0.026691019535064697, "rewards/rejected": 0.9701024889945984, "step": 6219 }, { "epoch": 3.35, "learning_rate": 6.667826023647471e-09, "logits/chosen": -2.2118897438049316, "logits/rejected": -2.3774850368499756, "logps/chosen": -1.9497044086456299, "logps/rejected": -2.060788631439209, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.7774896025657654, "rewards/margins": 0.016491174697875977, "rewards/rejected": 0.7609984278678894, "step": 6220 }, { "epoch": 3.36, "learning_rate": 6.656934645197626e-09, "logits/chosen": -2.0093724727630615, "logits/rejected": -2.008610248565674, "logps/chosen": -0.5908567905426025, "logps/rejected": -3.2208266258239746, "loss": 0.5025, "rewards/accuracies": 1.0, "rewards/chosen": 1.166775107383728, "rewards/margins": 0.4265182614326477, "rewards/rejected": 0.7402568459510803, "step": 6221 }, { "epoch": 3.36, "learning_rate": 6.646051534724417e-09, "logits/chosen": -2.0037314891815186, "logits/rejected": -2.0090038776397705, "logps/chosen": -0.6861709356307983, "logps/rejected": -4.833371162414551, "loss": 0.4516, "rewards/accuracies": 1.0, "rewards/chosen": 1.0940178632736206, "rewards/margins": 0.5605467557907104, "rewards/rejected": 0.5334711074829102, "step": 6222 }, { "epoch": 3.36, "learning_rate": 6.635176694303863e-09, "logits/chosen": -2.0784385204315186, "logits/rejected": -2.119969606399536, "logps/chosen": -2.6576128005981445, "logps/rejected": -9.478160858154297, "loss": 0.3419, "rewards/accuracies": 1.0, "rewards/chosen": 1.574126958847046, "rewards/margins": 0.897308886051178, "rewards/rejected": 0.6768180727958679, "step": 6223 }, { "epoch": 3.36, "learning_rate": 6.624310126010419e-09, "logits/chosen": -2.1455812454223633, "logits/rejected": -2.1569371223449707, "logps/chosen": -1.837071180343628, "logps/rejected": -3.5981781482696533, "loss": 0.4454, "rewards/accuracies": 1.0, "rewards/chosen": 1.4807384014129639, "rewards/margins": 0.5778816342353821, "rewards/rejected": 0.9028567671775818, "step": 6224 }, { "epoch": 3.36, "learning_rate": 6.6134518319169495e-09, "logits/chosen": -2.1444101333618164, "logits/rejected": -2.140174388885498, "logps/chosen": -3.489001512527466, "logps/rejected": -5.893210411071777, "loss": 0.3028, "rewards/accuracies": 1.0, "rewards/chosen": 1.4898314476013184, "rewards/margins": 1.0392863750457764, "rewards/rejected": 0.45054513216018677, "step": 6225 }, { "epoch": 3.36, "learning_rate": 6.602601814094749e-09, "logits/chosen": -2.0763015747070312, "logits/rejected": -2.297306537628174, "logps/chosen": -0.3004712462425232, "logps/rejected": -0.23560306429862976, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.9808148741722107, "rewards/margins": 0.034182965755462646, "rewards/rejected": 0.946631908416748, "step": 6226 }, { "epoch": 3.36, "learning_rate": 6.591760074613528e-09, "logits/chosen": -2.1280012130737305, "logits/rejected": -2.1202552318573, "logps/chosen": -1.531381607055664, "logps/rejected": -4.231698036193848, "loss": 0.4672, "rewards/accuracies": 1.0, "rewards/chosen": 1.3546901941299438, "rewards/margins": 0.5184130072593689, "rewards/rejected": 0.836277186870575, "step": 6227 }, { "epoch": 3.36, "learning_rate": 6.580926615541427e-09, "logits/chosen": -2.0204317569732666, "logits/rejected": -2.303786277770996, "logps/chosen": -0.09171567112207413, "logps/rejected": -0.09008871018886566, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.9370488524436951, "rewards/margins": -0.0017752647399902344, "rewards/rejected": 0.9388241171836853, "step": 6228 }, { "epoch": 3.36, "learning_rate": 6.570101438944986e-09, "logits/chosen": -2.223550796508789, "logits/rejected": -2.2206358909606934, "logps/chosen": -0.4474908113479614, "logps/rejected": -8.48508358001709, "loss": 0.3313, "rewards/accuracies": 1.0, "rewards/chosen": 1.0964021682739258, "rewards/margins": 0.9343942403793335, "rewards/rejected": 0.1620079129934311, "step": 6229 }, { "epoch": 3.36, "learning_rate": 6.559284546889194e-09, "logits/chosen": -2.0846898555755615, "logits/rejected": -2.3058533668518066, "logps/chosen": -1.0407251119613647, "logps/rejected": -0.9262104630470276, "loss": 0.7052, "rewards/accuracies": 0.0, "rewards/chosen": 0.9559219479560852, "rewards/margins": -0.024003684520721436, "rewards/rejected": 0.9799256324768066, "step": 6230 }, { "epoch": 3.36, "learning_rate": 6.548475941437437e-09, "logits/chosen": -2.105492115020752, "logits/rejected": -2.1076738834381104, "logps/chosen": -1.0629630088806152, "logps/rejected": -6.136890411376953, "loss": 0.3962, "rewards/accuracies": 1.0, "rewards/chosen": 1.0433523654937744, "rewards/margins": 0.7212636470794678, "rewards/rejected": 0.32208871841430664, "step": 6231 }, { "epoch": 3.36, "learning_rate": 6.537675624651529e-09, "logits/chosen": -2.071303606033325, "logits/rejected": -2.2790396213531494, "logps/chosen": -1.6872645616531372, "logps/rejected": -1.0311731100082397, "loss": 0.722, "rewards/accuracies": 0.0, "rewards/chosen": 0.6805238127708435, "rewards/margins": -0.05695611238479614, "rewards/rejected": 0.7374799251556396, "step": 6232 }, { "epoch": 3.36, "learning_rate": 6.526883598591692e-09, "logits/chosen": -2.0753209590911865, "logits/rejected": -2.3121886253356934, "logps/chosen": -0.9029712080955505, "logps/rejected": -0.981442391872406, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 1.0883179903030396, "rewards/margins": 0.0195387601852417, "rewards/rejected": 1.0687792301177979, "step": 6233 }, { "epoch": 3.36, "learning_rate": 6.516099865316599e-09, "logits/chosen": -2.000627040863037, "logits/rejected": -2.295715808868408, "logps/chosen": -2.1543660163879395, "logps/rejected": -4.765741348266602, "loss": 0.607, "rewards/accuracies": 1.0, "rewards/chosen": 0.8558835983276367, "rewards/margins": 0.18041282892227173, "rewards/rejected": 0.675470769405365, "step": 6234 }, { "epoch": 3.36, "learning_rate": 6.5053244268833014e-09, "logits/chosen": -2.160813331604004, "logits/rejected": -2.303244113922119, "logps/chosen": -1.2404861450195312, "logps/rejected": -3.0226693153381348, "loss": 0.7526, "rewards/accuracies": 0.0, "rewards/chosen": 0.8831462264060974, "rewards/margins": -0.11553406715393066, "rewards/rejected": 0.9986802935600281, "step": 6235 }, { "epoch": 3.36, "learning_rate": 6.494557285347296e-09, "logits/chosen": -2.1295089721679688, "logits/rejected": -2.339282989501953, "logps/chosen": -4.3282470703125, "logps/rejected": -3.289257287979126, "loss": 0.5868, "rewards/accuracies": 1.0, "rewards/chosen": 1.142237901687622, "rewards/margins": 0.22528481483459473, "rewards/rejected": 0.9169530868530273, "step": 6236 }, { "epoch": 3.36, "learning_rate": 6.483798442762478e-09, "logits/chosen": -2.1684117317199707, "logits/rejected": -2.2877299785614014, "logps/chosen": -0.8461556434631348, "logps/rejected": -0.8205978870391846, "loss": 0.6963, "rewards/accuracies": 0.0, "rewards/chosen": 0.717430830001831, "rewards/margins": -0.006251215934753418, "rewards/rejected": 0.7236820459365845, "step": 6237 }, { "epoch": 3.36, "learning_rate": 6.473047901181183e-09, "logits/chosen": -2.0178182125091553, "logits/rejected": -2.0188581943511963, "logps/chosen": -3.4244449138641357, "logps/rejected": -3.1881766319274902, "loss": 0.3689, "rewards/accuracies": 1.0, "rewards/chosen": 1.4738141298294067, "rewards/margins": 0.8070512413978577, "rewards/rejected": 0.6667628884315491, "step": 6238 }, { "epoch": 3.37, "learning_rate": 6.462305662654121e-09, "logits/chosen": -2.0553054809570312, "logits/rejected": -2.0622987747192383, "logps/chosen": -1.3007526397705078, "logps/rejected": -4.137569427490234, "loss": 0.437, "rewards/accuracies": 1.0, "rewards/chosen": 1.050972819328308, "rewards/margins": 0.6013944149017334, "rewards/rejected": 0.4495783746242523, "step": 6239 }, { "epoch": 3.37, "learning_rate": 6.451571729230465e-09, "logits/chosen": -2.2058980464935303, "logits/rejected": -2.1678473949432373, "logps/chosen": -16.107269287109375, "logps/rejected": -3.590975761413574, "loss": 0.2372, "rewards/accuracies": 1.0, "rewards/chosen": 1.8427757024765015, "rewards/margins": 1.3178033828735352, "rewards/rejected": 0.5249722599983215, "step": 6240 }, { "epoch": 3.37, "learning_rate": 6.4408461029577755e-09, "logits/chosen": -2.077446222305298, "logits/rejected": -2.0802810192108154, "logps/chosen": -1.033257246017456, "logps/rejected": -4.915383815765381, "loss": 0.532, "rewards/accuracies": 1.0, "rewards/chosen": 0.9142734408378601, "rewards/margins": 0.35342007875442505, "rewards/rejected": 0.5608533620834351, "step": 6241 }, { "epoch": 3.37, "learning_rate": 6.430128785882039e-09, "logits/chosen": -2.272839307785034, "logits/rejected": -2.2218446731567383, "logps/chosen": -31.257015228271484, "logps/rejected": -12.022601127624512, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 2.7840726375579834, "rewards/margins": 2.0657575130462646, "rewards/rejected": 0.718315064907074, "step": 6242 }, { "epoch": 3.37, "learning_rate": 6.419419780047658e-09, "logits/chosen": -2.1480753421783447, "logits/rejected": -2.4111509323120117, "logps/chosen": -0.48320624232292175, "logps/rejected": -0.4641609787940979, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.7573812007904053, "rewards/margins": 0.02838236093521118, "rewards/rejected": 0.7289988398551941, "step": 6243 }, { "epoch": 3.37, "learning_rate": 6.4087190874974276e-09, "logits/chosen": -2.173820734024048, "logits/rejected": -2.2000033855438232, "logps/chosen": -0.6316789388656616, "logps/rejected": -8.27304458618164, "loss": 0.4544, "rewards/accuracies": 1.0, "rewards/chosen": 1.1215245723724365, "rewards/margins": 0.5530164241790771, "rewards/rejected": 0.5685081481933594, "step": 6244 }, { "epoch": 3.37, "learning_rate": 6.398026710272608e-09, "logits/chosen": -2.1696910858154297, "logits/rejected": -2.1705448627471924, "logps/chosen": -0.5799987316131592, "logps/rejected": -5.334983825683594, "loss": 0.3525, "rewards/accuracies": 1.0, "rewards/chosen": 1.1085695028305054, "rewards/margins": 0.8614007234573364, "rewards/rejected": 0.24716877937316895, "step": 6245 }, { "epoch": 3.37, "learning_rate": 6.387342650412825e-09, "logits/chosen": -2.1492199897766113, "logits/rejected": -2.0455572605133057, "logps/chosen": -22.824359893798828, "logps/rejected": -3.2910022735595703, "loss": 0.1655, "rewards/accuracies": 1.0, "rewards/chosen": 2.308068037033081, "rewards/margins": 1.7151646614074707, "rewards/rejected": 0.5929033160209656, "step": 6246 }, { "epoch": 3.37, "learning_rate": 6.37666690995613e-09, "logits/chosen": -2.1999826431274414, "logits/rejected": -2.200563430786133, "logps/chosen": -0.22285795211791992, "logps/rejected": -5.666507720947266, "loss": 0.3981, "rewards/accuracies": 1.0, "rewards/chosen": 0.9482092261314392, "rewards/margins": 0.7152764797210693, "rewards/rejected": 0.23293276131153107, "step": 6247 }, { "epoch": 3.37, "learning_rate": 6.3659994909389945e-09, "logits/chosen": -2.0359385013580322, "logits/rejected": -2.241766929626465, "logps/chosen": -0.4685435891151428, "logps/rejected": -0.4541681110858917, "loss": 0.6695, "rewards/accuracies": 1.0, "rewards/chosen": 0.9565957188606262, "rewards/margins": 0.047892868518829346, "rewards/rejected": 0.9087028503417969, "step": 6248 }, { "epoch": 3.37, "learning_rate": 6.355340395396302e-09, "logits/chosen": -2.052574396133423, "logits/rejected": -2.339146852493286, "logps/chosen": -3.6669111251831055, "logps/rejected": -3.620419502258301, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.6127728819847107, "rewards/margins": 0.02264547348022461, "rewards/rejected": 0.5901274085044861, "step": 6249 }, { "epoch": 3.37, "learning_rate": 6.344689625361338e-09, "logits/chosen": -2.071284770965576, "logits/rejected": -2.2923474311828613, "logps/chosen": -5.131992340087891, "logps/rejected": -0.6660812497138977, "loss": 0.706, "rewards/accuracies": 0.0, "rewards/chosen": 0.9304631352424622, "rewards/margins": -0.025573909282684326, "rewards/rejected": 0.9560370445251465, "step": 6250 }, { "epoch": 3.37, "learning_rate": 6.334047182865815e-09, "logits/chosen": -2.0088462829589844, "logits/rejected": -2.0079739093780518, "logps/chosen": -1.6637907028198242, "logps/rejected": -8.892695426940918, "loss": 0.2704, "rewards/accuracies": 1.0, "rewards/chosen": 1.5214316844940186, "rewards/margins": 1.169742226600647, "rewards/rejected": 0.3516894280910492, "step": 6251 }, { "epoch": 3.37, "learning_rate": 6.323413069939848e-09, "logits/chosen": -2.0903170108795166, "logits/rejected": -2.1061623096466064, "logps/chosen": -0.8948714137077332, "logps/rejected": -3.3345746994018555, "loss": 0.3767, "rewards/accuracies": 1.0, "rewards/chosen": 1.5967402458190918, "rewards/margins": 0.7821523547172546, "rewards/rejected": 0.8145878911018372, "step": 6252 }, { "epoch": 3.37, "learning_rate": 6.312787288611965e-09, "logits/chosen": -2.0910305976867676, "logits/rejected": -2.0916967391967773, "logps/chosen": -0.366446316242218, "logps/rejected": -4.626853942871094, "loss": 0.4487, "rewards/accuracies": 1.0, "rewards/chosen": 0.9915930032730103, "rewards/margins": 0.5686275959014893, "rewards/rejected": 0.4229654371738434, "step": 6253 }, { "epoch": 3.37, "learning_rate": 6.302169840909099e-09, "logits/chosen": -2.031888961791992, "logits/rejected": -2.039412260055542, "logps/chosen": -1.3911856412887573, "logps/rejected": -3.347005844116211, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 1.0898288488388062, "rewards/margins": 0.5517950654029846, "rewards/rejected": 0.5380337834358215, "step": 6254 }, { "epoch": 3.37, "learning_rate": 6.2915607288565985e-09, "logits/chosen": -2.1781845092773438, "logits/rejected": -2.3132009506225586, "logps/chosen": -1.3616102933883667, "logps/rejected": -1.1427007913589478, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9785317778587341, "rewards/margins": 0.01235276460647583, "rewards/rejected": 0.9661790132522583, "step": 6255 }, { "epoch": 3.37, "learning_rate": 6.280959954478232e-09, "logits/chosen": -2.1986873149871826, "logits/rejected": -2.318737506866455, "logps/chosen": -0.2447526901960373, "logps/rejected": -0.26133859157562256, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.8404146432876587, "rewards/margins": 0.01841890811920166, "rewards/rejected": 0.821995735168457, "step": 6256 }, { "epoch": 3.37, "learning_rate": 6.270367519796155e-09, "logits/chosen": -2.096813678741455, "logits/rejected": -2.100548505783081, "logps/chosen": -1.0865639448165894, "logps/rejected": -10.709148406982422, "loss": 0.4414, "rewards/accuracies": 1.0, "rewards/chosen": 1.1342614889144897, "rewards/margins": 0.5889901518821716, "rewards/rejected": 0.5452713370323181, "step": 6257 }, { "epoch": 3.38, "learning_rate": 6.2597834268309555e-09, "logits/chosen": -1.9732345342636108, "logits/rejected": -2.2883760929107666, "logps/chosen": -0.7584567666053772, "logps/rejected": -0.8759075403213501, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.9606592059135437, "rewards/margins": 0.018541395664215088, "rewards/rejected": 0.9421178102493286, "step": 6258 }, { "epoch": 3.38, "learning_rate": 6.249207677601609e-09, "logits/chosen": -2.0447299480438232, "logits/rejected": -2.044480800628662, "logps/chosen": -0.17500324547290802, "logps/rejected": -3.046781301498413, "loss": 0.517, "rewards/accuracies": 1.0, "rewards/chosen": 0.8923062682151794, "rewards/margins": 0.3899931311607361, "rewards/rejected": 0.5023131370544434, "step": 6259 }, { "epoch": 3.38, "learning_rate": 6.2386402741255174e-09, "logits/chosen": -2.165703773498535, "logits/rejected": -2.1479344367980957, "logps/chosen": -9.11242961883545, "logps/rejected": -3.896653890609741, "loss": 0.4669, "rewards/accuracies": 1.0, "rewards/chosen": 1.1463340520858765, "rewards/margins": 0.5191647410392761, "rewards/rejected": 0.6271693110466003, "step": 6260 }, { "epoch": 3.38, "learning_rate": 6.228081218418474e-09, "logits/chosen": -2.313243865966797, "logits/rejected": -2.271885633468628, "logps/chosen": -7.087179660797119, "logps/rejected": -7.160407543182373, "loss": 0.6732, "rewards/accuracies": 1.0, "rewards/chosen": 0.5215165019035339, "rewards/margins": 0.04034265875816345, "rewards/rejected": 0.4811738431453705, "step": 6261 }, { "epoch": 3.38, "learning_rate": 6.217530512494701e-09, "logits/chosen": -2.1242122650146484, "logits/rejected": -2.1225903034210205, "logps/chosen": -1.0108628273010254, "logps/rejected": -2.484445571899414, "loss": 0.6096, "rewards/accuracies": 1.0, "rewards/chosen": 1.1736996173858643, "rewards/margins": 0.1747499704360962, "rewards/rejected": 0.9989496469497681, "step": 6262 }, { "epoch": 3.38, "learning_rate": 6.206988158366805e-09, "logits/chosen": -2.160184144973755, "logits/rejected": -2.158857583999634, "logps/chosen": -1.72922682762146, "logps/rejected": -4.220555305480957, "loss": 0.2663, "rewards/accuracies": 1.0, "rewards/chosen": 1.7280969619750977, "rewards/margins": 1.1868681907653809, "rewards/rejected": 0.5412287712097168, "step": 6263 }, { "epoch": 3.38, "learning_rate": 6.196454158045816e-09, "logits/chosen": -2.1413073539733887, "logits/rejected": -2.3635082244873047, "logps/chosen": -1.2639920711517334, "logps/rejected": -0.9251630306243896, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.9925926327705383, "rewards/margins": 0.01542520523071289, "rewards/rejected": 0.9771674275398254, "step": 6264 }, { "epoch": 3.38, "learning_rate": 6.185928513541161e-09, "logits/chosen": -2.0557024478912354, "logits/rejected": -2.0478148460388184, "logps/chosen": -5.040435791015625, "logps/rejected": -2.084646701812744, "loss": 0.3422, "rewards/accuracies": 1.0, "rewards/chosen": 1.7365907430648804, "rewards/margins": 0.8964189291000366, "rewards/rejected": 0.8401718139648438, "step": 6265 }, { "epoch": 3.38, "learning_rate": 6.1754112268606665e-09, "logits/chosen": -2.244126558303833, "logits/rejected": -2.088685989379883, "logps/chosen": -28.304588317871094, "logps/rejected": -11.901335716247559, "loss": 0.1131, "rewards/accuracies": 1.0, "rewards/chosen": 2.5643277168273926, "rewards/margins": 2.122610569000244, "rewards/rejected": 0.4417172372341156, "step": 6266 }, { "epoch": 3.38, "learning_rate": 6.164902300010594e-09, "logits/chosen": -2.044139862060547, "logits/rejected": -2.0355327129364014, "logps/chosen": -25.27355194091797, "logps/rejected": -8.896589279174805, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": 2.2884814739227295, "rewards/margins": 1.7801686525344849, "rewards/rejected": 0.5083128213882446, "step": 6267 }, { "epoch": 3.38, "learning_rate": 6.154401734995595e-09, "logits/chosen": -2.138801097869873, "logits/rejected": -2.135035276412964, "logps/chosen": -5.513866901397705, "logps/rejected": -5.188477993011475, "loss": 0.3923, "rewards/accuracies": 1.0, "rewards/chosen": 1.628268837928772, "rewards/margins": 0.7330873608589172, "rewards/rejected": 0.8951814770698547, "step": 6268 }, { "epoch": 3.38, "learning_rate": 6.143909533818703e-09, "logits/chosen": -2.044419527053833, "logits/rejected": -2.012265682220459, "logps/chosen": -4.906351089477539, "logps/rejected": -4.144732475280762, "loss": 0.2941, "rewards/accuracies": 1.0, "rewards/chosen": 1.511000633239746, "rewards/margins": 1.0733288526535034, "rewards/rejected": 0.4376717507839203, "step": 6269 }, { "epoch": 3.38, "learning_rate": 6.133425698481376e-09, "logits/chosen": -2.128422737121582, "logits/rejected": -2.1231324672698975, "logps/chosen": -2.6640937328338623, "logps/rejected": -2.4080450534820557, "loss": 0.3234, "rewards/accuracies": 1.0, "rewards/chosen": 1.5844024419784546, "rewards/margins": 0.9628515839576721, "rewards/rejected": 0.6215508580207825, "step": 6270 }, { "epoch": 3.38, "learning_rate": 6.122950230983475e-09, "logits/chosen": -2.0187418460845947, "logits/rejected": -2.2404425144195557, "logps/chosen": -1.5614510774612427, "logps/rejected": -1.6580599546432495, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.6547455787658691, "rewards/margins": 0.03355967998504639, "rewards/rejected": 0.6211858987808228, "step": 6271 }, { "epoch": 3.38, "learning_rate": 6.112483133323276e-09, "logits/chosen": -2.0105273723602295, "logits/rejected": -2.270987033843994, "logps/chosen": -3.8485467433929443, "logps/rejected": -3.711848735809326, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.909537136554718, "rewards/margins": 0.016027450561523438, "rewards/rejected": 0.8935096859931946, "step": 6272 }, { "epoch": 3.38, "learning_rate": 6.102024407497441e-09, "logits/chosen": -2.1052308082580566, "logits/rejected": -2.1045889854431152, "logps/chosen": -0.3399776220321655, "logps/rejected": -5.218770503997803, "loss": 0.4085, "rewards/accuracies": 1.0, "rewards/chosen": 1.0232986211776733, "rewards/margins": 0.6840527653694153, "rewards/rejected": 0.33924585580825806, "step": 6273 }, { "epoch": 3.38, "learning_rate": 6.091574055501042e-09, "logits/chosen": -2.0872385501861572, "logits/rejected": -2.281432628631592, "logps/chosen": -0.3404080271720886, "logps/rejected": -0.35500437021255493, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.9078438878059387, "rewards/margins": 0.006299138069152832, "rewards/rejected": 0.9015447497367859, "step": 6274 }, { "epoch": 3.38, "learning_rate": 6.081132079327545e-09, "logits/chosen": -2.012054443359375, "logits/rejected": -2.2550411224365234, "logps/chosen": -0.6623152494430542, "logps/rejected": -0.6427015066146851, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.9035307168960571, "rewards/margins": -0.0030770301818847656, "rewards/rejected": 0.9066077470779419, "step": 6275 }, { "epoch": 3.39, "learning_rate": 6.070698480968839e-09, "logits/chosen": -2.066638946533203, "logits/rejected": -2.0704050064086914, "logps/chosen": -0.9774681329727173, "logps/rejected": -4.243585109710693, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 1.2465656995773315, "rewards/margins": 0.530669629573822, "rewards/rejected": 0.7158960700035095, "step": 6276 }, { "epoch": 3.39, "learning_rate": 6.060273262415194e-09, "logits/chosen": -2.054711103439331, "logits/rejected": -2.274859666824341, "logps/chosen": -0.6005021929740906, "logps/rejected": -0.6413456201553345, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.933463990688324, "rewards/margins": 0.0344235897064209, "rewards/rejected": 0.8990404009819031, "step": 6277 }, { "epoch": 3.39, "learning_rate": 6.049856425655281e-09, "logits/chosen": -2.2251667976379395, "logits/rejected": -2.1885111331939697, "logps/chosen": -25.951175689697266, "logps/rejected": -1.9332349300384521, "loss": 0.3085, "rewards/accuracies": 1.0, "rewards/chosen": 2.0453720092773438, "rewards/margins": 1.017765760421753, "rewards/rejected": 1.0276062488555908, "step": 6278 }, { "epoch": 3.39, "learning_rate": 6.039447972676204e-09, "logits/chosen": -2.1703732013702393, "logits/rejected": -2.257230281829834, "logps/chosen": -0.6134955883026123, "logps/rejected": -0.5718221068382263, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.9019195437431335, "rewards/margins": 0.013113260269165039, "rewards/rejected": 0.8888062834739685, "step": 6279 }, { "epoch": 3.39, "learning_rate": 6.029047905463425e-09, "logits/chosen": -2.0438685417175293, "logits/rejected": -2.079061985015869, "logps/chosen": -1.0693310499191284, "logps/rejected": -10.82710075378418, "loss": 0.2654, "rewards/accuracies": 1.0, "rewards/chosen": 1.5658410787582397, "rewards/margins": 1.1908977031707764, "rewards/rejected": 0.374943345785141, "step": 6280 }, { "epoch": 3.39, "learning_rate": 6.018656226000835e-09, "logits/chosen": -2.1159963607788086, "logits/rejected": -2.074136972427368, "logps/chosen": -5.165596961975098, "logps/rejected": -3.1468636989593506, "loss": 0.3155, "rewards/accuracies": 1.0, "rewards/chosen": 1.6089104413986206, "rewards/margins": 0.9916431903839111, "rewards/rejected": 0.6172672510147095, "step": 6281 }, { "epoch": 3.39, "learning_rate": 6.008272936270714e-09, "logits/chosen": -2.1346356868743896, "logits/rejected": -2.1335740089416504, "logps/chosen": -4.97840690612793, "logps/rejected": -10.182795524597168, "loss": 0.1974, "rewards/accuracies": 1.0, "rewards/chosen": 1.4443689584732056, "rewards/margins": 1.5223029851913452, "rewards/rejected": -0.07793407887220383, "step": 6282 }, { "epoch": 3.39, "learning_rate": 5.99789803825374e-09, "logits/chosen": -2.2087888717651367, "logits/rejected": -2.279249668121338, "logps/chosen": -2.045280933380127, "logps/rejected": -12.187521934509277, "loss": 0.4186, "rewards/accuracies": 1.0, "rewards/chosen": 1.4771791696548462, "rewards/margins": 0.6542305946350098, "rewards/rejected": 0.8229485750198364, "step": 6283 }, { "epoch": 3.39, "learning_rate": 5.9875315339289965e-09, "logits/chosen": -2.073702812194824, "logits/rejected": -2.074455499649048, "logps/chosen": -2.5850274562835693, "logps/rejected": -5.200652122497559, "loss": 0.2883, "rewards/accuracies": 1.0, "rewards/chosen": 1.572290301322937, "rewards/margins": 1.0961452722549438, "rewards/rejected": 0.4761449992656708, "step": 6284 }, { "epoch": 3.39, "learning_rate": 5.977173425273968e-09, "logits/chosen": -2.0973329544067383, "logits/rejected": -2.272087574005127, "logps/chosen": -0.19595381617546082, "logps/rejected": -0.21001163125038147, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9296323657035828, "rewards/margins": 0.013423502445220947, "rewards/rejected": 0.9162088632583618, "step": 6285 }, { "epoch": 3.39, "learning_rate": 5.9668237142645185e-09, "logits/chosen": -2.1052472591400146, "logits/rejected": -2.292565107345581, "logps/chosen": -0.9122616052627563, "logps/rejected": -1.0895652770996094, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 1.095476508140564, "rewards/margins": 0.014621376991271973, "rewards/rejected": 1.080855131149292, "step": 6286 }, { "epoch": 3.39, "learning_rate": 5.9564824028749295e-09, "logits/chosen": -2.184250593185425, "logits/rejected": -2.069331169128418, "logps/chosen": -25.69620132446289, "logps/rejected": -3.7811665534973145, "loss": 0.1954, "rewards/accuracies": 1.0, "rewards/chosen": 2.0079410076141357, "rewards/margins": 1.533258080482483, "rewards/rejected": 0.4746829569339752, "step": 6287 }, { "epoch": 3.39, "learning_rate": 5.9461494930778785e-09, "logits/chosen": -2.163424491882324, "logits/rejected": -2.1535251140594482, "logps/chosen": -7.407902240753174, "logps/rejected": -1.4267423152923584, "loss": 0.4168, "rewards/accuracies": 1.0, "rewards/chosen": 1.423535943031311, "rewards/margins": 0.6595053672790527, "rewards/rejected": 0.7640305757522583, "step": 6288 }, { "epoch": 3.39, "learning_rate": 5.935824986844423e-09, "logits/chosen": -2.1148505210876465, "logits/rejected": -2.29081130027771, "logps/chosen": -1.039109230041504, "logps/rejected": -9.484784126281738, "loss": 0.6444, "rewards/accuracies": 1.0, "rewards/chosen": 1.0608839988708496, "rewards/margins": 0.09999388456344604, "rewards/rejected": 0.9608901143074036, "step": 6289 }, { "epoch": 3.39, "learning_rate": 5.925508886144054e-09, "logits/chosen": -2.0565335750579834, "logits/rejected": -2.309577465057373, "logps/chosen": -0.3149225413799286, "logps/rejected": -0.21945339441299438, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.9401516914367676, "rewards/margins": 0.02760601043701172, "rewards/rejected": 0.9125456809997559, "step": 6290 }, { "epoch": 3.39, "learning_rate": 5.915201192944624e-09, "logits/chosen": -2.1053431034088135, "logits/rejected": -2.254927158355713, "logps/chosen": -0.3680252432823181, "logps/rejected": -2.8995227813720703, "loss": 0.625, "rewards/accuracies": 1.0, "rewards/chosen": 0.9886972308158875, "rewards/margins": 0.14122295379638672, "rewards/rejected": 0.8474742770195007, "step": 6291 }, { "epoch": 3.39, "learning_rate": 5.90490190921239e-09, "logits/chosen": -2.0553719997406006, "logits/rejected": -2.0620126724243164, "logps/chosen": -1.2947102785110474, "logps/rejected": -5.682570457458496, "loss": 0.4305, "rewards/accuracies": 1.0, "rewards/chosen": 1.1728273630142212, "rewards/margins": 0.6198009848594666, "rewards/rejected": 0.5530263781547546, "step": 6292 }, { "epoch": 3.39, "learning_rate": 5.8946110369120176e-09, "logits/chosen": -2.1965551376342773, "logits/rejected": -2.1833741664886475, "logps/chosen": -2.9596829414367676, "logps/rejected": -11.287337303161621, "loss": 0.5167, "rewards/accuracies": 1.0, "rewards/chosen": 0.9012670516967773, "rewards/margins": 0.3908013105392456, "rewards/rejected": 0.5104657411575317, "step": 6293 }, { "epoch": 3.39, "learning_rate": 5.884328578006548e-09, "logits/chosen": -2.073040008544922, "logits/rejected": -2.2959275245666504, "logps/chosen": -0.6782735586166382, "logps/rejected": -0.665633499622345, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.9532781839370728, "rewards/margins": 0.009790897369384766, "rewards/rejected": 0.943487286567688, "step": 6294 }, { "epoch": 3.4, "learning_rate": 5.87405453445744e-09, "logits/chosen": -2.1403071880340576, "logits/rejected": -2.150923252105713, "logps/chosen": -1.927811622619629, "logps/rejected": -1.931033968925476, "loss": 0.5116, "rewards/accuracies": 1.0, "rewards/chosen": 1.4118331670761108, "rewards/margins": 0.40351057052612305, "rewards/rejected": 1.0083225965499878, "step": 6295 }, { "epoch": 3.4, "learning_rate": 5.863788908224526e-09, "logits/chosen": -2.213479518890381, "logits/rejected": -2.2172622680664062, "logps/chosen": -0.18039439618587494, "logps/rejected": -5.884512901306152, "loss": 0.405, "rewards/accuracies": 1.0, "rewards/chosen": 0.9390100836753845, "rewards/margins": 0.6945081949234009, "rewards/rejected": 0.24450187385082245, "step": 6296 }, { "epoch": 3.4, "learning_rate": 5.853531701266046e-09, "logits/chosen": -2.0990121364593506, "logits/rejected": -2.086115598678589, "logps/chosen": -0.5227850675582886, "logps/rejected": -6.866210460662842, "loss": 0.3802, "rewards/accuracies": 1.0, "rewards/chosen": 1.1053794622421265, "rewards/margins": 0.7708399891853333, "rewards/rejected": 0.3345394730567932, "step": 6297 }, { "epoch": 3.4, "learning_rate": 5.843282915538627e-09, "logits/chosen": -2.191448926925659, "logits/rejected": -2.197014570236206, "logps/chosen": -0.3368847966194153, "logps/rejected": -7.439171314239502, "loss": 0.3488, "rewards/accuracies": 1.0, "rewards/chosen": 1.0487252473831177, "rewards/margins": 0.8736310005187988, "rewards/rejected": 0.17509427666664124, "step": 6298 }, { "epoch": 3.4, "learning_rate": 5.8330425529973025e-09, "logits/chosen": -1.9746427536010742, "logits/rejected": -1.974094033241272, "logps/chosen": -0.8683112263679504, "logps/rejected": -1.107055902481079, "loss": 0.5906, "rewards/accuracies": 1.0, "rewards/chosen": 0.965336799621582, "rewards/margins": 0.2168484330177307, "rewards/rejected": 0.7484883666038513, "step": 6299 }, { "epoch": 3.4, "learning_rate": 5.822810615595475e-09, "logits/chosen": -2.1451046466827393, "logits/rejected": -2.141686201095581, "logps/chosen": -4.702282905578613, "logps/rejected": -6.225443363189697, "loss": 0.3086, "rewards/accuracies": 1.0, "rewards/chosen": 1.2647238969802856, "rewards/margins": 1.0173808336257935, "rewards/rejected": 0.2473430186510086, "step": 6300 }, { "epoch": 3.4, "learning_rate": 5.8125871052849665e-09, "logits/chosen": -2.01481294631958, "logits/rejected": -2.027693748474121, "logps/chosen": -1.352813959121704, "logps/rejected": -2.303218364715576, "loss": 0.4475, "rewards/accuracies": 1.0, "rewards/chosen": 1.269376277923584, "rewards/margins": 0.5718729496002197, "rewards/rejected": 0.6975033283233643, "step": 6301 }, { "epoch": 3.4, "learning_rate": 5.802372024015972e-09, "logits/chosen": -2.192206621170044, "logits/rejected": -2.1467387676239014, "logps/chosen": -18.38323974609375, "logps/rejected": -5.668523788452148, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": 1.9978046417236328, "rewards/margins": 1.5982816219329834, "rewards/rejected": 0.3995230793952942, "step": 6302 }, { "epoch": 3.4, "learning_rate": 5.792165373737085e-09, "logits/chosen": -2.120728015899658, "logits/rejected": -2.3609161376953125, "logps/chosen": -1.5760705471038818, "logps/rejected": -1.7530865669250488, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": 0.7388492822647095, "rewards/margins": 0.03621089458465576, "rewards/rejected": 0.7026383876800537, "step": 6303 }, { "epoch": 3.4, "learning_rate": 5.781967156395301e-09, "logits/chosen": -2.0043280124664307, "logits/rejected": -2.2834019660949707, "logps/chosen": -4.929418563842773, "logps/rejected": -7.123451232910156, "loss": 0.6625, "rewards/accuracies": 1.0, "rewards/chosen": 0.7800731658935547, "rewards/margins": 0.06229209899902344, "rewards/rejected": 0.7177810668945312, "step": 6304 }, { "epoch": 3.4, "learning_rate": 5.771777373935988e-09, "logits/chosen": -1.984242558479309, "logits/rejected": -1.9849116802215576, "logps/chosen": -0.08600276708602905, "logps/rejected": -8.164619445800781, "loss": 0.4704, "rewards/accuracies": 1.0, "rewards/chosen": 0.8332490921020508, "rewards/margins": 0.5096503496170044, "rewards/rejected": 0.3235987722873688, "step": 6305 }, { "epoch": 3.4, "learning_rate": 5.761596028302918e-09, "logits/chosen": -2.0340051651000977, "logits/rejected": -2.0486974716186523, "logps/chosen": -3.129424810409546, "logps/rejected": -8.104499816894531, "loss": 0.4391, "rewards/accuracies": 1.0, "rewards/chosen": 1.1396321058273315, "rewards/margins": 0.5955509543418884, "rewards/rejected": 0.5440811514854431, "step": 6306 }, { "epoch": 3.4, "learning_rate": 5.751423121438248e-09, "logits/chosen": -2.2010653018951416, "logits/rejected": -2.2022244930267334, "logps/chosen": -0.5264256596565247, "logps/rejected": -2.7376585006713867, "loss": 0.5154, "rewards/accuracies": 1.0, "rewards/chosen": 1.0898817777633667, "rewards/margins": 0.39412713050842285, "rewards/rejected": 0.6957546472549438, "step": 6307 }, { "epoch": 3.4, "learning_rate": 5.741258655282532e-09, "logits/chosen": -2.115617275238037, "logits/rejected": -1.9963229894638062, "logps/chosen": -22.989856719970703, "logps/rejected": -4.353902816772461, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": 1.8008015155792236, "rewards/margins": 0.8160243630409241, "rewards/rejected": 0.9847771525382996, "step": 6308 }, { "epoch": 3.4, "learning_rate": 5.731102631774704e-09, "logits/chosen": -2.128262519836426, "logits/rejected": -2.154162645339966, "logps/chosen": -1.0590801239013672, "logps/rejected": -7.574156761169434, "loss": 0.3968, "rewards/accuracies": 1.0, "rewards/chosen": 1.284368872642517, "rewards/margins": 0.7194174528121948, "rewards/rejected": 0.5649514198303223, "step": 6309 }, { "epoch": 3.4, "learning_rate": 5.7209550528521e-09, "logits/chosen": -2.1329376697540283, "logits/rejected": -2.3120834827423096, "logps/chosen": -0.4808286130428314, "logps/rejected": -0.6588046550750732, "loss": 0.6713, "rewards/accuracies": 1.0, "rewards/chosen": 0.7078346610069275, "rewards/margins": 0.044190406799316406, "rewards/rejected": 0.6636442542076111, "step": 6310 }, { "epoch": 3.4, "learning_rate": 5.710815920450418e-09, "logits/chosen": -2.052241086959839, "logits/rejected": -2.0568668842315674, "logps/chosen": -0.814218282699585, "logps/rejected": -13.54521369934082, "loss": 0.437, "rewards/accuracies": 1.0, "rewards/chosen": 1.092982292175293, "rewards/margins": 0.6013860702514648, "rewards/rejected": 0.4915962219238281, "step": 6311 }, { "epoch": 3.4, "learning_rate": 5.700685236503788e-09, "logits/chosen": -1.9892237186431885, "logits/rejected": -1.9918771982192993, "logps/chosen": -0.11664675176143646, "logps/rejected": -8.901984214782715, "loss": 0.3996, "rewards/accuracies": 1.0, "rewards/chosen": 0.841222882270813, "rewards/margins": 0.7107019424438477, "rewards/rejected": 0.13052092492580414, "step": 6312 }, { "epoch": 3.41, "learning_rate": 5.690563002944704e-09, "logits/chosen": -2.0690500736236572, "logits/rejected": -2.0752782821655273, "logps/chosen": -0.3475497364997864, "logps/rejected": -8.914379119873047, "loss": 0.3473, "rewards/accuracies": 1.0, "rewards/chosen": 1.0335829257965088, "rewards/margins": 0.8787732720375061, "rewards/rejected": 0.15480966866016388, "step": 6313 }, { "epoch": 3.41, "learning_rate": 5.680449221704037e-09, "logits/chosen": -2.2170329093933105, "logits/rejected": -2.3763208389282227, "logps/chosen": -0.7519486546516418, "logps/rejected": -0.8298592567443848, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9708275198936462, "rewards/margins": 0.015191316604614258, "rewards/rejected": 0.955636203289032, "step": 6314 }, { "epoch": 3.41, "learning_rate": 5.670343894711072e-09, "logits/chosen": -2.1444008350372314, "logits/rejected": -2.138885498046875, "logps/chosen": -4.177011966705322, "logps/rejected": -7.4658379554748535, "loss": 0.5759, "rewards/accuracies": 1.0, "rewards/chosen": 0.9715448617935181, "rewards/margins": 0.2500235438346863, "rewards/rejected": 0.7215213179588318, "step": 6315 }, { "epoch": 3.41, "learning_rate": 5.660247023893444e-09, "logits/chosen": -2.1064705848693848, "logits/rejected": -2.235887289047241, "logps/chosen": -0.12799958884716034, "logps/rejected": -0.12235801666975021, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.9115786552429199, "rewards/margins": 0.04213017225265503, "rewards/rejected": 0.8694484829902649, "step": 6316 }, { "epoch": 3.41, "learning_rate": 5.6501586111772195e-09, "logits/chosen": -2.0814597606658936, "logits/rejected": -2.2436296939849854, "logps/chosen": -3.271463394165039, "logps/rejected": -2.7132339477539062, "loss": 0.7352, "rewards/accuracies": 0.0, "rewards/chosen": 0.9168128967285156, "rewards/margins": -0.08246535062789917, "rewards/rejected": 0.9992782473564148, "step": 6317 }, { "epoch": 3.41, "learning_rate": 5.640078658486819e-09, "logits/chosen": -2.2029271125793457, "logits/rejected": -2.2007691860198975, "logps/chosen": -2.0957095623016357, "logps/rejected": -4.129372596740723, "loss": 0.4877, "rewards/accuracies": 1.0, "rewards/chosen": 1.1204273700714111, "rewards/margins": 0.4642217755317688, "rewards/rejected": 0.6562055945396423, "step": 6318 }, { "epoch": 3.41, "learning_rate": 5.63000716774506e-09, "logits/chosen": -2.266366720199585, "logits/rejected": -2.144033432006836, "logps/chosen": -31.247941970825195, "logps/rejected": -5.487783908843994, "loss": 0.1057, "rewards/accuracies": 1.0, "rewards/chosen": 2.8944132328033447, "rewards/margins": 2.193542003631592, "rewards/rejected": 0.7008712291717529, "step": 6319 }, { "epoch": 3.41, "learning_rate": 5.619944140873151e-09, "logits/chosen": -2.2270419597625732, "logits/rejected": -2.134131669998169, "logps/chosen": -17.930744171142578, "logps/rejected": -1.944800853729248, "loss": 0.1394, "rewards/accuracies": 1.0, "rewards/chosen": 2.5902099609375, "rewards/margins": 1.8995544910430908, "rewards/rejected": 0.6906554102897644, "step": 6320 }, { "epoch": 3.41, "learning_rate": 5.609889579790678e-09, "logits/chosen": -2.213869571685791, "logits/rejected": -2.2914345264434814, "logps/chosen": -3.7254693508148193, "logps/rejected": -24.858837127685547, "loss": 0.4335, "rewards/accuracies": 1.0, "rewards/chosen": 1.1502825021743774, "rewards/margins": 0.6112424731254578, "rewards/rejected": 0.5390400290489197, "step": 6321 }, { "epoch": 3.41, "learning_rate": 5.599843486415606e-09, "logits/chosen": -2.140667676925659, "logits/rejected": -2.0324904918670654, "logps/chosen": -17.069578170776367, "logps/rejected": -5.2495927810668945, "loss": 0.3012, "rewards/accuracies": 1.0, "rewards/chosen": 1.6949423551559448, "rewards/margins": 1.045792579650879, "rewards/rejected": 0.6491497159004211, "step": 6322 }, { "epoch": 3.41, "learning_rate": 5.5898058626643156e-09, "logits/chosen": -2.020725965499878, "logits/rejected": -2.230675458908081, "logps/chosen": -1.2656714916229248, "logps/rejected": -1.368523120880127, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.6478668451309204, "rewards/margins": 0.011543095111846924, "rewards/rejected": 0.6363237500190735, "step": 6323 }, { "epoch": 3.41, "learning_rate": 5.579776710451539e-09, "logits/chosen": -2.0392770767211914, "logits/rejected": -2.0422937870025635, "logps/chosen": -5.2136945724487305, "logps/rejected": -11.963282585144043, "loss": 0.3642, "rewards/accuracies": 1.0, "rewards/chosen": 1.7544220685958862, "rewards/margins": 0.8225444555282593, "rewards/rejected": 0.931877613067627, "step": 6324 }, { "epoch": 3.41, "learning_rate": 5.569756031690398e-09, "logits/chosen": -2.0913376808166504, "logits/rejected": -2.0982258319854736, "logps/chosen": -1.5263880491256714, "logps/rejected": -4.069843292236328, "loss": 0.4032, "rewards/accuracies": 1.0, "rewards/chosen": 1.1491469144821167, "rewards/margins": 0.6999540328979492, "rewards/rejected": 0.44919291138648987, "step": 6325 }, { "epoch": 3.41, "learning_rate": 5.5597438282924126e-09, "logits/chosen": -2.2183589935302734, "logits/rejected": -2.373769521713257, "logps/chosen": -0.9144113063812256, "logps/rejected": -1.7324484586715698, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 1.0293434858322144, "rewards/margins": 0.04271739721298218, "rewards/rejected": 0.9866260886192322, "step": 6326 }, { "epoch": 3.41, "learning_rate": 5.549740102167472e-09, "logits/chosen": -2.1009023189544678, "logits/rejected": -2.3177907466888428, "logps/chosen": -0.25371137261390686, "logps/rejected": -0.22334450483322144, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.7658822536468506, "rewards/margins": 0.02458113431930542, "rewards/rejected": 0.7413011193275452, "step": 6327 }, { "epoch": 3.41, "learning_rate": 5.539744855223849e-09, "logits/chosen": -2.0713534355163574, "logits/rejected": -2.0748519897460938, "logps/chosen": -0.34796619415283203, "logps/rejected": -5.033166408538818, "loss": 0.4975, "rewards/accuracies": 1.0, "rewards/chosen": 0.8519362807273865, "rewards/margins": 0.43920162320137024, "rewards/rejected": 0.41273465752601624, "step": 6328 }, { "epoch": 3.41, "learning_rate": 5.529758089368214e-09, "logits/chosen": -2.0270838737487793, "logits/rejected": -2.264725923538208, "logps/chosen": -1.6904469728469849, "logps/rejected": -1.2552862167358398, "loss": 0.708, "rewards/accuracies": 0.0, "rewards/chosen": 0.8698288202285767, "rewards/margins": -0.029466450214385986, "rewards/rejected": 0.8992952704429626, "step": 6329 }, { "epoch": 3.41, "learning_rate": 5.519779806505598e-09, "logits/chosen": -2.0111052989959717, "logits/rejected": -2.0247042179107666, "logps/chosen": -0.6132708787918091, "logps/rejected": -11.234015464782715, "loss": 0.5126, "rewards/accuracies": 1.0, "rewards/chosen": 1.0486794710159302, "rewards/margins": 0.4010341763496399, "rewards/rejected": 0.6476452946662903, "step": 6330 }, { "epoch": 3.41, "learning_rate": 5.509810008539434e-09, "logits/chosen": -2.1944241523742676, "logits/rejected": -2.3862457275390625, "logps/chosen": -8.076146125793457, "logps/rejected": -11.240704536437988, "loss": 0.779, "rewards/accuracies": 0.0, "rewards/chosen": 1.2022404670715332, "rewards/margins": -0.16490232944488525, "rewards/rejected": 1.3671427965164185, "step": 6331 }, { "epoch": 3.42, "learning_rate": 5.499848697371529e-09, "logits/chosen": -2.1199679374694824, "logits/rejected": -2.2711305618286133, "logps/chosen": -0.748849630355835, "logps/rejected": -0.7174683213233948, "loss": 0.7037, "rewards/accuracies": 0.0, "rewards/chosen": 0.9125208854675293, "rewards/margins": -0.020938873291015625, "rewards/rejected": 0.9334597587585449, "step": 6332 }, { "epoch": 3.42, "learning_rate": 5.48989587490205e-09, "logits/chosen": -2.1425859928131104, "logits/rejected": -2.2720940113067627, "logps/chosen": -0.33880680799484253, "logps/rejected": -0.3505905866622925, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8374356627464294, "rewards/margins": 0.012187302112579346, "rewards/rejected": 0.8252483606338501, "step": 6333 }, { "epoch": 3.42, "learning_rate": 5.479951543029565e-09, "logits/chosen": -2.174865484237671, "logits/rejected": -2.367107629776001, "logps/chosen": -0.8176720142364502, "logps/rejected": -0.8517686724662781, "loss": 0.6999, "rewards/accuracies": 0.0, "rewards/chosen": 0.8835798501968384, "rewards/margins": -0.013376891613006592, "rewards/rejected": 0.896956741809845, "step": 6334 }, { "epoch": 3.42, "learning_rate": 5.470015703651043e-09, "logits/chosen": -2.037600040435791, "logits/rejected": -2.300877809524536, "logps/chosen": -0.17377528548240662, "logps/rejected": -0.2337723970413208, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 1.0522332191467285, "rewards/margins": 0.009041190147399902, "rewards/rejected": 1.0431920289993286, "step": 6335 }, { "epoch": 3.42, "learning_rate": 5.460088358661802e-09, "logits/chosen": -2.0081300735473633, "logits/rejected": -2.007161855697632, "logps/chosen": -4.369757652282715, "logps/rejected": -0.5003901720046997, "loss": 0.7176, "rewards/accuracies": 0.0, "rewards/chosen": 0.8997778296470642, "rewards/margins": -0.04837846755981445, "rewards/rejected": 0.9481562972068787, "step": 6336 }, { "epoch": 3.42, "learning_rate": 5.450169509955549e-09, "logits/chosen": -2.134373188018799, "logits/rejected": -2.13511323928833, "logps/chosen": -2.2150847911834717, "logps/rejected": -1.8849378824234009, "loss": 0.502, "rewards/accuracies": 1.0, "rewards/chosen": 1.3814395666122437, "rewards/margins": 0.427703320980072, "rewards/rejected": 0.9537362456321716, "step": 6337 }, { "epoch": 3.42, "learning_rate": 5.44025915942436e-09, "logits/chosen": -1.9990129470825195, "logits/rejected": -1.986246943473816, "logps/chosen": -8.481094360351562, "logps/rejected": -9.633502960205078, "loss": 0.4913, "rewards/accuracies": 1.0, "rewards/chosen": 1.6018909215927124, "rewards/margins": 0.4551200866699219, "rewards/rejected": 1.1467708349227905, "step": 6338 }, { "epoch": 3.42, "learning_rate": 5.43035730895871e-09, "logits/chosen": -2.0582754611968994, "logits/rejected": -2.2825891971588135, "logps/chosen": -8.92458724975586, "logps/rejected": -7.8548784255981445, "loss": 0.7576, "rewards/accuracies": 0.0, "rewards/chosen": 0.799368679523468, "rewards/margins": -0.12495297193527222, "rewards/rejected": 0.9243216514587402, "step": 6339 }, { "epoch": 3.42, "learning_rate": 5.420463960447446e-09, "logits/chosen": -2.0523056983947754, "logits/rejected": -2.0513498783111572, "logps/chosen": -3.1779141426086426, "logps/rejected": -3.2821171283721924, "loss": 0.4223, "rewards/accuracies": 1.0, "rewards/chosen": 1.4060767889022827, "rewards/margins": 0.6434590220451355, "rewards/rejected": 0.7626177668571472, "step": 6340 }, { "epoch": 3.42, "learning_rate": 5.410579115777781e-09, "logits/chosen": -2.1532180309295654, "logits/rejected": -2.215533494949341, "logps/chosen": -5.88885498046875, "logps/rejected": -16.25663948059082, "loss": 0.5306, "rewards/accuracies": 1.0, "rewards/chosen": 1.614633560180664, "rewards/margins": 0.3566586971282959, "rewards/rejected": 1.2579748630523682, "step": 6341 }, { "epoch": 3.42, "learning_rate": 5.400702776835314e-09, "logits/chosen": -2.1798698902130127, "logits/rejected": -2.3175950050354004, "logps/chosen": -1.3531908988952637, "logps/rejected": -1.0740846395492554, "loss": 0.7472, "rewards/accuracies": 0.0, "rewards/chosen": 0.9440008401870728, "rewards/margins": -0.10538458824157715, "rewards/rejected": 1.04938542842865, "step": 6342 }, { "epoch": 3.42, "learning_rate": 5.390834945504031e-09, "logits/chosen": -1.9821202754974365, "logits/rejected": -2.274038076400757, "logps/chosen": -0.4421096444129944, "logps/rejected": -3.2850422859191895, "loss": 0.5588, "rewards/accuracies": 1.0, "rewards/chosen": 1.0603125095367432, "rewards/margins": 0.28967732191085815, "rewards/rejected": 0.770635187625885, "step": 6343 }, { "epoch": 3.42, "learning_rate": 5.380975623666278e-09, "logits/chosen": -2.0149714946746826, "logits/rejected": -2.2721238136291504, "logps/chosen": -0.30397701263427734, "logps/rejected": -0.41013091802597046, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 1.0123625993728638, "rewards/margins": 0.023738563060760498, "rewards/rejected": 0.9886240363121033, "step": 6344 }, { "epoch": 3.42, "learning_rate": 5.3711248132027876e-09, "logits/chosen": -2.073756456375122, "logits/rejected": -2.0799150466918945, "logps/chosen": -5.034262180328369, "logps/rejected": -1.2561688423156738, "loss": 0.4228, "rewards/accuracies": 1.0, "rewards/chosen": 1.4159435033798218, "rewards/margins": 0.6419087648391724, "rewards/rejected": 0.7740347385406494, "step": 6345 }, { "epoch": 3.42, "learning_rate": 5.361282515992665e-09, "logits/chosen": -1.9782739877700806, "logits/rejected": -2.291563034057617, "logps/chosen": -4.145296096801758, "logps/rejected": -3.6670258045196533, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 0.8330454230308533, "rewards/margins": -0.017973780632019043, "rewards/rejected": 0.8510192036628723, "step": 6346 }, { "epoch": 3.42, "learning_rate": 5.351448733913405e-09, "logits/chosen": -2.199894905090332, "logits/rejected": -2.287815570831299, "logps/chosen": -0.46611636877059937, "logps/rejected": -0.438775897026062, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 1.0731089115142822, "rewards/margins": 0.018375396728515625, "rewards/rejected": 1.0547335147857666, "step": 6347 }, { "epoch": 3.42, "learning_rate": 5.34162346884085e-09, "logits/chosen": -2.059429168701172, "logits/rejected": -2.287454605102539, "logps/chosen": -2.0334689617156982, "logps/rejected": -2.0365538597106934, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.8892423510551453, "rewards/margins": -0.005147218704223633, "rewards/rejected": 0.8943895697593689, "step": 6348 }, { "epoch": 3.42, "learning_rate": 5.331806722649251e-09, "logits/chosen": -2.0237059593200684, "logits/rejected": -2.027118682861328, "logps/chosen": -2.3609559535980225, "logps/rejected": -3.379272937774658, "loss": 0.4582, "rewards/accuracies": 1.0, "rewards/chosen": 1.449873924255371, "rewards/margins": 0.5426197052001953, "rewards/rejected": 0.9072542190551758, "step": 6349 }, { "epoch": 3.43, "learning_rate": 5.321998497211205e-09, "logits/chosen": -2.101393222808838, "logits/rejected": -2.3291876316070557, "logps/chosen": -1.1899285316467285, "logps/rejected": -1.3672454357147217, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9945951700210571, "rewards/margins": 0.005589783191680908, "rewards/rejected": 0.9890053868293762, "step": 6350 }, { "epoch": 3.43, "learning_rate": 5.312198794397699e-09, "logits/chosen": -2.1860415935516357, "logits/rejected": -2.192047595977783, "logps/chosen": -1.8767483234405518, "logps/rejected": -4.016104221343994, "loss": 0.4828, "rewards/accuracies": 1.0, "rewards/chosen": 0.9484665989875793, "rewards/margins": 0.4771578311920166, "rewards/rejected": 0.47130876779556274, "step": 6351 }, { "epoch": 3.43, "learning_rate": 5.302407616078092e-09, "logits/chosen": -2.0485942363739014, "logits/rejected": -2.0643022060394287, "logps/chosen": -1.4594740867614746, "logps/rejected": -11.364175796508789, "loss": 0.51, "rewards/accuracies": 1.0, "rewards/chosen": 1.2257782220840454, "rewards/margins": 0.407509446144104, "rewards/rejected": 0.8182687759399414, "step": 6352 }, { "epoch": 3.43, "learning_rate": 5.292624964120112e-09, "logits/chosen": -2.0227363109588623, "logits/rejected": -2.0154199600219727, "logps/chosen": -3.0815207958221436, "logps/rejected": -6.373857498168945, "loss": 0.2747, "rewards/accuracies": 1.0, "rewards/chosen": 1.4317786693572998, "rewards/margins": 1.1517633199691772, "rewards/rejected": 0.28001537919044495, "step": 6353 }, { "epoch": 3.43, "learning_rate": 5.282850840389874e-09, "logits/chosen": -1.994368553161621, "logits/rejected": -1.992372989654541, "logps/chosen": -9.023138999938965, "logps/rejected": -2.038790702819824, "loss": 0.4398, "rewards/accuracies": 1.0, "rewards/chosen": 1.3928160667419434, "rewards/margins": 0.5934985280036926, "rewards/rejected": 0.7993175387382507, "step": 6354 }, { "epoch": 3.43, "learning_rate": 5.273085246751851e-09, "logits/chosen": -2.081220865249634, "logits/rejected": -2.0796942710876465, "logps/chosen": -0.22397926449775696, "logps/rejected": -6.0234150886535645, "loss": 0.4882, "rewards/accuracies": 1.0, "rewards/chosen": 1.0542491674423218, "rewards/margins": 0.463101863861084, "rewards/rejected": 0.5911473035812378, "step": 6355 }, { "epoch": 3.43, "learning_rate": 5.263328185068888e-09, "logits/chosen": -2.0672218799591064, "logits/rejected": -2.3646304607391357, "logps/chosen": -0.09435438364744186, "logps/rejected": -0.09314343333244324, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.7383329272270203, "rewards/margins": 0.010959267616271973, "rewards/rejected": 0.7273736596107483, "step": 6356 }, { "epoch": 3.43, "learning_rate": 5.253579657202223e-09, "logits/chosen": -2.0560696125030518, "logits/rejected": -2.2618324756622314, "logps/chosen": -0.7501660585403442, "logps/rejected": -0.7416316270828247, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 1.0422000885009766, "rewards/margins": 0.011864542961120605, "rewards/rejected": 1.030335545539856, "step": 6357 }, { "epoch": 3.43, "learning_rate": 5.243839665011446e-09, "logits/chosen": -2.222224235534668, "logits/rejected": -2.3534038066864014, "logps/chosen": -4.425352573394775, "logps/rejected": -0.7214575409889221, "loss": 0.8433, "rewards/accuracies": 0.0, "rewards/chosen": 0.8876226544380188, "rewards/margins": -0.28067976236343384, "rewards/rejected": 1.1683024168014526, "step": 6358 }, { "epoch": 3.43, "learning_rate": 5.234108210354527e-09, "logits/chosen": -2.039747953414917, "logits/rejected": -2.2780344486236572, "logps/chosen": -0.2534753382205963, "logps/rejected": -0.2923767566680908, "loss": 0.7028, "rewards/accuracies": 0.0, "rewards/chosen": 0.9702973365783691, "rewards/margins": -0.01917898654937744, "rewards/rejected": 0.9894763231277466, "step": 6359 }, { "epoch": 3.43, "learning_rate": 5.224385295087797e-09, "logits/chosen": -2.055734157562256, "logits/rejected": -2.22422194480896, "logps/chosen": -0.323452353477478, "logps/rejected": -0.3046795427799225, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 1.0194141864776611, "rewards/margins": 0.0008400678634643555, "rewards/rejected": 1.0185741186141968, "step": 6360 }, { "epoch": 3.43, "learning_rate": 5.214670921065989e-09, "logits/chosen": -2.1119606494903564, "logits/rejected": -2.31697416305542, "logps/chosen": -12.958946228027344, "logps/rejected": -5.975139141082764, "loss": 0.8787, "rewards/accuracies": 0.0, "rewards/chosen": 0.6274738311767578, "rewards/margins": -0.3420414328575134, "rewards/rejected": 0.9695152640342712, "step": 6361 }, { "epoch": 3.43, "learning_rate": 5.204965090142149e-09, "logits/chosen": -2.0184319019317627, "logits/rejected": -2.280775547027588, "logps/chosen": -1.0509291887283325, "logps/rejected": -1.0832080841064453, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.8530070185661316, "rewards/margins": 0.009448707103729248, "rewards/rejected": 0.8435583114624023, "step": 6362 }, { "epoch": 3.43, "learning_rate": 5.1952678041677525e-09, "logits/chosen": -2.172823905944824, "logits/rejected": -2.315443992614746, "logps/chosen": -0.611396849155426, "logps/rejected": -0.5795185565948486, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.9278406500816345, "rewards/margins": -0.0041866302490234375, "rewards/rejected": 0.932027280330658, "step": 6363 }, { "epoch": 3.43, "learning_rate": 5.185579064992618e-09, "logits/chosen": -2.227386236190796, "logits/rejected": -2.2284305095672607, "logps/chosen": -0.15004834532737732, "logps/rejected": -5.59105110168457, "loss": 0.3956, "rewards/accuracies": 1.0, "rewards/chosen": 1.0573548078536987, "rewards/margins": 0.7229669094085693, "rewards/rejected": 0.334387868642807, "step": 6364 }, { "epoch": 3.43, "learning_rate": 5.175898874464929e-09, "logits/chosen": -1.985829472541809, "logits/rejected": -2.2801568508148193, "logps/chosen": -2.972679615020752, "logps/rejected": -2.8361823558807373, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 1.119604468345642, "rewards/margins": 0.02054774761199951, "rewards/rejected": 1.0990567207336426, "step": 6365 }, { "epoch": 3.43, "learning_rate": 5.1662272344312526e-09, "logits/chosen": -2.101228952407837, "logits/rejected": -2.112273693084717, "logps/chosen": -5.338949680328369, "logps/rejected": -4.881007194519043, "loss": 0.3429, "rewards/accuracies": 1.0, "rewards/chosen": 1.825635552406311, "rewards/margins": 0.8939511179924011, "rewards/rejected": 0.9316844344139099, "step": 6366 }, { "epoch": 3.43, "learning_rate": 5.156564146736508e-09, "logits/chosen": -2.0833301544189453, "logits/rejected": -2.292262077331543, "logps/chosen": -3.3580832481384277, "logps/rejected": -2.8248932361602783, "loss": 0.6604, "rewards/accuracies": 1.0, "rewards/chosen": 0.9274494051933289, "rewards/margins": 0.0665353536605835, "rewards/rejected": 0.8609140515327454, "step": 6367 }, { "epoch": 3.43, "learning_rate": 5.1469096132240155e-09, "logits/chosen": -1.975056767463684, "logits/rejected": -1.9822362661361694, "logps/chosen": -3.167506217956543, "logps/rejected": -5.632070541381836, "loss": 0.4112, "rewards/accuracies": 1.0, "rewards/chosen": 1.1760891675949097, "rewards/margins": 0.6761072874069214, "rewards/rejected": 0.4999818801879883, "step": 6368 }, { "epoch": 3.44, "learning_rate": 5.137263635735423e-09, "logits/chosen": -1.985079050064087, "logits/rejected": -2.2428996562957764, "logps/chosen": -0.3210008144378662, "logps/rejected": -0.33694568276405334, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9675607681274414, "rewards/margins": 0.016436100006103516, "rewards/rejected": 0.9511246681213379, "step": 6369 }, { "epoch": 3.44, "learning_rate": 5.12762621611077e-09, "logits/chosen": -2.087932586669922, "logits/rejected": -2.2670156955718994, "logps/chosen": -3.4328205585479736, "logps/rejected": -2.029283285140991, "loss": 0.7764, "rewards/accuracies": 0.0, "rewards/chosen": 0.9880587458610535, "rewards/margins": -0.16011470556259155, "rewards/rejected": 1.148173451423645, "step": 6370 }, { "epoch": 3.44, "learning_rate": 5.117997356188452e-09, "logits/chosen": -2.0554633140563965, "logits/rejected": -2.2761056423187256, "logps/chosen": -0.6450552344322205, "logps/rejected": -0.6499527096748352, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8270285725593567, "rewards/margins": 0.020744144916534424, "rewards/rejected": 0.8062844276428223, "step": 6371 }, { "epoch": 3.44, "learning_rate": 5.108377057805252e-09, "logits/chosen": -2.0068650245666504, "logits/rejected": -2.266176700592041, "logps/chosen": -0.3377489149570465, "logps/rejected": -0.34993958473205566, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.8889105916023254, "rewards/margins": -0.0005143880844116211, "rewards/rejected": 0.8894249796867371, "step": 6372 }, { "epoch": 3.44, "learning_rate": 5.098765322796289e-09, "logits/chosen": -2.1368162631988525, "logits/rejected": -2.28895902633667, "logps/chosen": -4.844359874725342, "logps/rejected": -6.207622051239014, "loss": 0.5632, "rewards/accuracies": 1.0, "rewards/chosen": 1.0878208875656128, "rewards/margins": 0.2794477939605713, "rewards/rejected": 0.8083730936050415, "step": 6373 }, { "epoch": 3.44, "learning_rate": 5.089162152995074e-09, "logits/chosen": -2.0586304664611816, "logits/rejected": -2.055314302444458, "logps/chosen": -2.1581664085388184, "logps/rejected": -5.964362621307373, "loss": 0.3863, "rewards/accuracies": 1.0, "rewards/chosen": 1.2567081451416016, "rewards/margins": 0.7518734335899353, "rewards/rejected": 0.5048347115516663, "step": 6374 }, { "epoch": 3.44, "learning_rate": 5.079567550233477e-09, "logits/chosen": -2.0670769214630127, "logits/rejected": -2.2492835521698, "logps/chosen": -1.088775634765625, "logps/rejected": -1.1302915811538696, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.8485469818115234, "rewards/margins": 0.003206789493560791, "rewards/rejected": 0.8453401923179626, "step": 6375 }, { "epoch": 3.44, "learning_rate": 5.0699815163417256e-09, "logits/chosen": -2.0883638858795166, "logits/rejected": -2.3067727088928223, "logps/chosen": -8.778963088989258, "logps/rejected": -4.635895252227783, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 1.1596364974975586, "rewards/margins": 0.03160417079925537, "rewards/rejected": 1.1280323266983032, "step": 6376 }, { "epoch": 3.44, "learning_rate": 5.060404053148426e-09, "logits/chosen": -2.0343286991119385, "logits/rejected": -2.0342822074890137, "logps/chosen": -1.345223069190979, "logps/rejected": -3.4054906368255615, "loss": 0.5056, "rewards/accuracies": 1.0, "rewards/chosen": 0.9447757601737976, "rewards/margins": 0.41861093044281006, "rewards/rejected": 0.5261648297309875, "step": 6377 }, { "epoch": 3.44, "learning_rate": 5.050835162480549e-09, "logits/chosen": -2.223903179168701, "logits/rejected": -2.2295029163360596, "logps/chosen": -1.5709683895111084, "logps/rejected": -5.0439133644104, "loss": 0.4334, "rewards/accuracies": 1.0, "rewards/chosen": 0.9987117052078247, "rewards/margins": 0.6116563081741333, "rewards/rejected": 0.387055367231369, "step": 6378 }, { "epoch": 3.44, "learning_rate": 5.04127484616339e-09, "logits/chosen": -2.1542880535125732, "logits/rejected": -2.3419411182403564, "logps/chosen": -1.8167431354522705, "logps/rejected": -1.661643624305725, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.8760771751403809, "rewards/margins": 0.0003743171691894531, "rewards/rejected": 0.8757028579711914, "step": 6379 }, { "epoch": 3.44, "learning_rate": 5.03172310602068e-09, "logits/chosen": -2.0964980125427246, "logits/rejected": -2.0846097469329834, "logps/chosen": -0.7690902948379517, "logps/rejected": -11.361906051635742, "loss": 0.5148, "rewards/accuracies": 1.0, "rewards/chosen": 1.0475624799728394, "rewards/margins": 0.39549511671066284, "rewards/rejected": 0.6520673632621765, "step": 6380 }, { "epoch": 3.44, "learning_rate": 5.022179943874461e-09, "logits/chosen": -2.1725614070892334, "logits/rejected": -2.2523770332336426, "logps/chosen": -4.663516998291016, "logps/rejected": -3.4842138290405273, "loss": 0.6047, "rewards/accuracies": 1.0, "rewards/chosen": 0.7678184509277344, "rewards/margins": 0.18539434671401978, "rewards/rejected": 0.5824241042137146, "step": 6381 }, { "epoch": 3.44, "learning_rate": 5.012645361545159e-09, "logits/chosen": -2.0787851810455322, "logits/rejected": -2.2645015716552734, "logps/chosen": -1.412315011024475, "logps/rejected": -1.319997787475586, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.7002076506614685, "rewards/margins": 0.00850212574005127, "rewards/rejected": 0.6917055249214172, "step": 6382 }, { "epoch": 3.44, "learning_rate": 5.003119360851554e-09, "logits/chosen": -2.1720569133758545, "logits/rejected": -2.1751530170440674, "logps/chosen": -0.1328139752149582, "logps/rejected": -6.8204026222229, "loss": 0.3991, "rewards/accuracies": 1.0, "rewards/chosen": 0.8644027709960938, "rewards/margins": 0.7123163938522339, "rewards/rejected": 0.15208640694618225, "step": 6383 }, { "epoch": 3.44, "learning_rate": 4.993601943610798e-09, "logits/chosen": -2.069873571395874, "logits/rejected": -2.0630950927734375, "logps/chosen": -3.6860804557800293, "logps/rejected": -3.4910995960235596, "loss": 0.2868, "rewards/accuracies": 1.0, "rewards/chosen": 1.6023372411727905, "rewards/margins": 1.102299690246582, "rewards/rejected": 0.5000374913215637, "step": 6384 }, { "epoch": 3.44, "learning_rate": 4.984093111638399e-09, "logits/chosen": -2.1821417808532715, "logits/rejected": -2.3286547660827637, "logps/chosen": -3.3297154903411865, "logps/rejected": -2.8389251232147217, "loss": 0.7129, "rewards/accuracies": 0.0, "rewards/chosen": 0.9722582697868347, "rewards/margins": -0.03917950391769409, "rewards/rejected": 1.0114377737045288, "step": 6385 }, { "epoch": 3.44, "learning_rate": 4.9745928667482286e-09, "logits/chosen": -2.091019868850708, "logits/rejected": -2.0891289710998535, "logps/chosen": -0.5141929388046265, "logps/rejected": -5.863644123077393, "loss": 0.4011, "rewards/accuracies": 1.0, "rewards/chosen": 0.9338405728340149, "rewards/margins": 0.7064496278762817, "rewards/rejected": 0.22739091515541077, "step": 6386 }, { "epoch": 3.44, "learning_rate": 4.965101210752526e-09, "logits/chosen": -2.156921625137329, "logits/rejected": -2.139171838760376, "logps/chosen": -6.363539218902588, "logps/rejected": -5.662482261657715, "loss": 0.3435, "rewards/accuracies": 1.0, "rewards/chosen": 1.4081801176071167, "rewards/margins": 0.891976535320282, "rewards/rejected": 0.5162035822868347, "step": 6387 }, { "epoch": 3.45, "learning_rate": 4.955618145461882e-09, "logits/chosen": -2.15674090385437, "logits/rejected": -2.3902511596679688, "logps/chosen": -8.367341995239258, "logps/rejected": -12.29849624633789, "loss": 0.7115, "rewards/accuracies": 0.0, "rewards/chosen": 1.1206095218658447, "rewards/margins": -0.036370277404785156, "rewards/rejected": 1.1569797992706299, "step": 6388 }, { "epoch": 3.45, "learning_rate": 4.946143672685255e-09, "logits/chosen": -2.182621717453003, "logits/rejected": -2.159274101257324, "logps/chosen": -8.418042182922363, "logps/rejected": -7.7685089111328125, "loss": 0.2427, "rewards/accuracies": 1.0, "rewards/chosen": 1.839211344718933, "rewards/margins": 1.2923076152801514, "rewards/rejected": 0.5469037890434265, "step": 6389 }, { "epoch": 3.45, "learning_rate": 4.936677794229954e-09, "logits/chosen": -2.1318647861480713, "logits/rejected": -2.1577513217926025, "logps/chosen": -6.277517318725586, "logps/rejected": -1.461704969406128, "loss": 0.3737, "rewards/accuracies": 1.0, "rewards/chosen": 1.7265815734863281, "rewards/margins": 0.7917494773864746, "rewards/rejected": 0.9348320960998535, "step": 6390 }, { "epoch": 3.45, "learning_rate": 4.927220511901692e-09, "logits/chosen": -2.228581428527832, "logits/rejected": -2.2330520153045654, "logps/chosen": -0.4242663085460663, "logps/rejected": -4.685995578765869, "loss": 0.4201, "rewards/accuracies": 1.0, "rewards/chosen": 0.9957114458084106, "rewards/margins": 0.6499624252319336, "rewards/rejected": 0.34574905037879944, "step": 6391 }, { "epoch": 3.45, "learning_rate": 4.917771827504474e-09, "logits/chosen": -1.9434092044830322, "logits/rejected": -1.9794588088989258, "logps/chosen": -0.5166016221046448, "logps/rejected": -13.26644515991211, "loss": 0.5253, "rewards/accuracies": 1.0, "rewards/chosen": 1.091373085975647, "rewards/margins": 0.3697195053100586, "rewards/rejected": 0.7216535806655884, "step": 6392 }, { "epoch": 3.45, "learning_rate": 4.90833174284071e-09, "logits/chosen": -2.056272506713867, "logits/rejected": -2.0593461990356445, "logps/chosen": -1.3537359237670898, "logps/rejected": -1.5019166469573975, "loss": 0.5757, "rewards/accuracies": 1.0, "rewards/chosen": 0.9882357716560364, "rewards/margins": 0.25046074390411377, "rewards/rejected": 0.7377750277519226, "step": 6393 }, { "epoch": 3.45, "learning_rate": 4.8989002597111585e-09, "logits/chosen": -2.0235321521759033, "logits/rejected": -2.0233049392700195, "logps/chosen": -0.4623042941093445, "logps/rejected": -4.2852678298950195, "loss": 0.4578, "rewards/accuracies": 1.0, "rewards/chosen": 1.0498303174972534, "rewards/margins": 0.5437564253807068, "rewards/rejected": 0.5060738921165466, "step": 6394 }, { "epoch": 3.45, "learning_rate": 4.8894773799149355e-09, "logits/chosen": -2.1940107345581055, "logits/rejected": -2.0182416439056396, "logps/chosen": -32.91845703125, "logps/rejected": -4.535397529602051, "loss": 0.1545, "rewards/accuracies": 1.0, "rewards/chosen": 2.508836507797241, "rewards/margins": 1.7894701957702637, "rewards/rejected": 0.7193663716316223, "step": 6395 }, { "epoch": 3.45, "learning_rate": 4.880063105249521e-09, "logits/chosen": -2.0619876384735107, "logits/rejected": -2.244246482849121, "logps/chosen": -0.34467923641204834, "logps/rejected": -0.3800737261772156, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.9851460456848145, "rewards/margins": 0.022595703601837158, "rewards/rejected": 0.9625503420829773, "step": 6396 }, { "epoch": 3.45, "learning_rate": 4.87065743751075e-09, "logits/chosen": -1.9849883317947388, "logits/rejected": -2.291576623916626, "logps/chosen": -0.07532159239053726, "logps/rejected": -0.08433115482330322, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.8390361666679382, "rewards/margins": 0.015789508819580078, "rewards/rejected": 0.8232466578483582, "step": 6397 }, { "epoch": 3.45, "learning_rate": 4.861260378492815e-09, "logits/chosen": -2.0936636924743652, "logits/rejected": -2.0443115234375, "logps/chosen": -25.801212310791016, "logps/rejected": -2.1277942657470703, "loss": 0.2235, "rewards/accuracies": 1.0, "rewards/chosen": 2.176239490509033, "rewards/margins": 1.38472580909729, "rewards/rejected": 0.7915136218070984, "step": 6398 }, { "epoch": 3.45, "learning_rate": 4.851871929988266e-09, "logits/chosen": -2.0457780361175537, "logits/rejected": -2.0587494373321533, "logps/chosen": -6.616009712219238, "logps/rejected": -3.2449586391448975, "loss": 0.3293, "rewards/accuracies": 1.0, "rewards/chosen": 1.7040008306503296, "rewards/margins": 0.9415401816368103, "rewards/rejected": 0.7624606490135193, "step": 6399 }, { "epoch": 3.45, "learning_rate": 4.842492093788014e-09, "logits/chosen": -2.061483860015869, "logits/rejected": -2.079596757888794, "logps/chosen": -0.8416879177093506, "logps/rejected": -3.119967460632324, "loss": 0.6075, "rewards/accuracies": 1.0, "rewards/chosen": 1.009583830833435, "rewards/margins": 0.17929667234420776, "rewards/rejected": 0.8302871584892273, "step": 6400 }, { "epoch": 3.45, "learning_rate": 4.833120871681312e-09, "logits/chosen": -2.0555014610290527, "logits/rejected": -2.0533785820007324, "logps/chosen": -2.0622706413269043, "logps/rejected": -6.403809547424316, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0306919813156128, "rewards/margins": 0.7514658570289612, "rewards/rejected": 0.2792261242866516, "step": 6401 }, { "epoch": 3.45, "learning_rate": 4.823758265455802e-09, "logits/chosen": -2.1981382369995117, "logits/rejected": -2.1584503650665283, "logps/chosen": -5.691601276397705, "logps/rejected": -15.999160766601562, "loss": 0.3383, "rewards/accuracies": 1.0, "rewards/chosen": 1.279211401939392, "rewards/margins": 0.9100120067596436, "rewards/rejected": 0.36919936537742615, "step": 6402 }, { "epoch": 3.45, "learning_rate": 4.81440427689746e-09, "logits/chosen": -2.0280227661132812, "logits/rejected": -1.9927598237991333, "logps/chosen": -11.825417518615723, "logps/rejected": -2.7368907928466797, "loss": 0.2773, "rewards/accuracies": 1.0, "rewards/chosen": 1.9037525653839111, "rewards/margins": 1.1409146785736084, "rewards/rejected": 0.762837827205658, "step": 6403 }, { "epoch": 3.45, "learning_rate": 4.80505890779061e-09, "logits/chosen": -2.035168170928955, "logits/rejected": -2.0346107482910156, "logps/chosen": -1.820539951324463, "logps/rejected": -4.873623371124268, "loss": 0.3433, "rewards/accuracies": 1.0, "rewards/chosen": 1.5240777730941772, "rewards/margins": 0.892657458782196, "rewards/rejected": 0.6314203143119812, "step": 6404 }, { "epoch": 3.45, "learning_rate": 4.795722159917959e-09, "logits/chosen": -2.1624341011047363, "logits/rejected": -2.159788131713867, "logps/chosen": -5.792495250701904, "logps/rejected": -4.2953715324401855, "loss": 0.3382, "rewards/accuracies": 1.0, "rewards/chosen": 1.4005619287490845, "rewards/margins": 0.9100890755653381, "rewards/rejected": 0.49047285318374634, "step": 6405 }, { "epoch": 3.46, "learning_rate": 4.786394035060537e-09, "logits/chosen": -1.9704227447509766, "logits/rejected": -2.2644758224487305, "logps/chosen": -0.7801094651222229, "logps/rejected": -0.9646336436271667, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 0.8582262396812439, "rewards/margins": 0.03787851333618164, "rewards/rejected": 0.8203477263450623, "step": 6406 }, { "epoch": 3.46, "learning_rate": 4.7770745349977546e-09, "logits/chosen": -1.9965654611587524, "logits/rejected": -2.255811929702759, "logps/chosen": -12.508829116821289, "logps/rejected": -0.3458610773086548, "loss": 0.4309, "rewards/accuracies": 1.0, "rewards/chosen": 1.461377501487732, "rewards/margins": 0.6185959577560425, "rewards/rejected": 0.8427815437316895, "step": 6407 }, { "epoch": 3.46, "learning_rate": 4.7677636615073736e-09, "logits/chosen": -2.1688766479492188, "logits/rejected": -2.3234503269195557, "logps/chosen": -0.9918184280395508, "logps/rejected": -7.943669319152832, "loss": 0.5422, "rewards/accuracies": 1.0, "rewards/chosen": 1.1721595525741577, "rewards/margins": 0.32878023386001587, "rewards/rejected": 0.8433793187141418, "step": 6408 }, { "epoch": 3.46, "learning_rate": 4.758461416365489e-09, "logits/chosen": -2.2493791580200195, "logits/rejected": -2.0638556480407715, "logps/chosen": -55.55683517456055, "logps/rejected": -0.5208725333213806, "loss": 0.1095, "rewards/accuracies": 1.0, "rewards/chosen": 2.9857258796691895, "rewards/margins": 2.1563258171081543, "rewards/rejected": 0.8294001817703247, "step": 6409 }, { "epoch": 3.46, "learning_rate": 4.749167801346576e-09, "logits/chosen": -1.9342727661132812, "logits/rejected": -2.326932191848755, "logps/chosen": -4.548975944519043, "logps/rejected": -3.865131378173828, "loss": 0.6415, "rewards/accuracies": 1.0, "rewards/chosen": 0.9543835520744324, "rewards/margins": 0.10605037212371826, "rewards/rejected": 0.8483331799507141, "step": 6410 }, { "epoch": 3.46, "learning_rate": 4.739882818223445e-09, "logits/chosen": -2.0575344562530518, "logits/rejected": -2.236759901046753, "logps/chosen": -0.1697293370962143, "logps/rejected": -0.16999056935310364, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.9080665707588196, "rewards/margins": 0.024534106254577637, "rewards/rejected": 0.8835324645042419, "step": 6411 }, { "epoch": 3.46, "learning_rate": 4.730606468767273e-09, "logits/chosen": -2.0723018646240234, "logits/rejected": -2.255300760269165, "logps/chosen": -0.31608596444129944, "logps/rejected": -0.2211376279592514, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.7877028584480286, "rewards/margins": 0.017080485820770264, "rewards/rejected": 0.7706223726272583, "step": 6412 }, { "epoch": 3.46, "learning_rate": 4.721338754747589e-09, "logits/chosen": -2.0923216342926025, "logits/rejected": -2.1185944080352783, "logps/chosen": -21.887889862060547, "logps/rejected": -8.862295150756836, "loss": 0.2774, "rewards/accuracies": 1.0, "rewards/chosen": 1.9712657928466797, "rewards/margins": 1.1401963233947754, "rewards/rejected": 0.8310694098472595, "step": 6413 }, { "epoch": 3.46, "learning_rate": 4.712079677932274e-09, "logits/chosen": -2.049680709838867, "logits/rejected": -2.0407755374908447, "logps/chosen": -6.277795791625977, "logps/rejected": -4.650843143463135, "loss": 0.4777, "rewards/accuracies": 1.0, "rewards/chosen": 1.1169496774673462, "rewards/margins": 0.4904431104660034, "rewards/rejected": 0.6265065670013428, "step": 6414 }, { "epoch": 3.46, "learning_rate": 4.702829240087547e-09, "logits/chosen": -2.1340062618255615, "logits/rejected": -2.301780939102173, "logps/chosen": -0.4323055148124695, "logps/rejected": -0.46746134757995605, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.8971706628799438, "rewards/margins": 0.007437169551849365, "rewards/rejected": 0.8897334933280945, "step": 6415 }, { "epoch": 3.46, "learning_rate": 4.693587442977997e-09, "logits/chosen": -2.1234283447265625, "logits/rejected": -2.0894105434417725, "logps/chosen": -2.363124370574951, "logps/rejected": -4.461413383483887, "loss": 0.3884, "rewards/accuracies": 1.0, "rewards/chosen": 1.2046310901641846, "rewards/margins": 0.7451981902122498, "rewards/rejected": 0.4594328999519348, "step": 6416 }, { "epoch": 3.46, "learning_rate": 4.684354288366555e-09, "logits/chosen": -2.152482748031616, "logits/rejected": -2.0674631595611572, "logps/chosen": -5.362309455871582, "logps/rejected": -2.1752917766571045, "loss": 0.3359, "rewards/accuracies": 1.0, "rewards/chosen": 1.7681468725204468, "rewards/margins": 0.9181872010231018, "rewards/rejected": 0.849959671497345, "step": 6417 }, { "epoch": 3.46, "learning_rate": 4.6751297780145085e-09, "logits/chosen": -2.2164571285247803, "logits/rejected": -2.2751364707946777, "logps/chosen": -7.838911533355713, "logps/rejected": -5.445754528045654, "loss": 0.6616, "rewards/accuracies": 1.0, "rewards/chosen": 0.8889808654785156, "rewards/margins": 0.0641055703163147, "rewards/rejected": 0.8248752951622009, "step": 6418 }, { "epoch": 3.46, "learning_rate": 4.665913913681496e-09, "logits/chosen": -2.0203042030334473, "logits/rejected": -2.2295947074890137, "logps/chosen": -0.30650168657302856, "logps/rejected": -0.26033511757850647, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.9271686673164368, "rewards/margins": 0.005848228931427002, "rewards/rejected": 0.9213204383850098, "step": 6419 }, { "epoch": 3.46, "learning_rate": 4.656706697125495e-09, "logits/chosen": -2.071932077407837, "logits/rejected": -2.0699732303619385, "logps/chosen": -0.32184073328971863, "logps/rejected": -2.542861223220825, "loss": 0.5231, "rewards/accuracies": 1.0, "rewards/chosen": 1.0165549516677856, "rewards/margins": 0.3749554753303528, "rewards/rejected": 0.6415994763374329, "step": 6420 }, { "epoch": 3.46, "learning_rate": 4.647508130102856e-09, "logits/chosen": -2.119044542312622, "logits/rejected": -2.119443893432617, "logps/chosen": -3.5300161838531494, "logps/rejected": -3.4899826049804688, "loss": 0.4312, "rewards/accuracies": 1.0, "rewards/chosen": 1.208264708518982, "rewards/margins": 0.6178404688835144, "rewards/rejected": 0.5904242396354675, "step": 6421 }, { "epoch": 3.46, "learning_rate": 4.638318214368258e-09, "logits/chosen": -2.0716426372528076, "logits/rejected": -2.318852186203003, "logps/chosen": -1.0233964920043945, "logps/rejected": -3.075622081756592, "loss": 0.6054, "rewards/accuracies": 1.0, "rewards/chosen": 1.136272668838501, "rewards/margins": 0.18401950597763062, "rewards/rejected": 0.9522531628608704, "step": 6422 }, { "epoch": 3.46, "learning_rate": 4.629136951674745e-09, "logits/chosen": -2.0475502014160156, "logits/rejected": -2.3122074604034424, "logps/chosen": -3.2724480628967285, "logps/rejected": -0.5386359095573425, "loss": 0.6102, "rewards/accuracies": 1.0, "rewards/chosen": 1.185707688331604, "rewards/margins": 0.17332065105438232, "rewards/rejected": 1.0123870372772217, "step": 6423 }, { "epoch": 3.46, "learning_rate": 4.619964343773691e-09, "logits/chosen": -2.0832808017730713, "logits/rejected": -2.082832098007202, "logps/chosen": -2.111853837966919, "logps/rejected": -4.468227386474609, "loss": 0.3165, "rewards/accuracies": 1.0, "rewards/chosen": 1.4994391202926636, "rewards/margins": 0.9880847334861755, "rewards/rejected": 0.511354386806488, "step": 6424 }, { "epoch": 3.47, "learning_rate": 4.610800392414849e-09, "logits/chosen": -2.05792498588562, "logits/rejected": -2.2826321125030518, "logps/chosen": -0.2728198766708374, "logps/rejected": -0.25935426354408264, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 0.8846381306648254, "rewards/margins": 0.021924495697021484, "rewards/rejected": 0.862713634967804, "step": 6425 }, { "epoch": 3.47, "learning_rate": 4.601645099346286e-09, "logits/chosen": -2.0804407596588135, "logits/rejected": -2.081954002380371, "logps/chosen": -1.2162166833877563, "logps/rejected": -1.7771861553192139, "loss": 0.4737, "rewards/accuracies": 1.0, "rewards/chosen": 1.361676573753357, "rewards/margins": 0.500906229019165, "rewards/rejected": 0.8607703447341919, "step": 6426 }, { "epoch": 3.47, "learning_rate": 4.592498466314448e-09, "logits/chosen": -2.1174070835113525, "logits/rejected": -2.1289422512054443, "logps/chosen": -0.40701600909233093, "logps/rejected": -6.638676643371582, "loss": 0.555, "rewards/accuracies": 1.0, "rewards/chosen": 1.1136281490325928, "rewards/margins": 0.2985503673553467, "rewards/rejected": 0.8150777816772461, "step": 6427 }, { "epoch": 3.47, "learning_rate": 4.583360495064109e-09, "logits/chosen": -1.9757850170135498, "logits/rejected": -1.9832547903060913, "logps/chosen": -0.33821672201156616, "logps/rejected": -13.134222030639648, "loss": 0.2855, "rewards/accuracies": 1.0, "rewards/chosen": 1.052939772605896, "rewards/margins": 1.1075284481048584, "rewards/rejected": -0.05458870157599449, "step": 6428 }, { "epoch": 3.47, "learning_rate": 4.574231187338401e-09, "logits/chosen": -2.1362149715423584, "logits/rejected": -2.067720890045166, "logps/chosen": -23.1124210357666, "logps/rejected": -2.2427806854248047, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": 2.5532453060150146, "rewards/margins": 1.763992428779602, "rewards/rejected": 0.7892528772354126, "step": 6429 }, { "epoch": 3.47, "learning_rate": 4.565110544878798e-09, "logits/chosen": -2.146299123764038, "logits/rejected": -2.258978843688965, "logps/chosen": -0.8071126341819763, "logps/rejected": -0.7472429275512695, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.9752494692802429, "rewards/margins": 0.00371396541595459, "rewards/rejected": 0.9715355038642883, "step": 6430 }, { "epoch": 3.47, "learning_rate": 4.555998569425118e-09, "logits/chosen": -2.157292127609253, "logits/rejected": -2.157041549682617, "logps/chosen": -2.336091995239258, "logps/rejected": -2.417677164077759, "loss": 0.611, "rewards/accuracies": 1.0, "rewards/chosen": 0.9762188196182251, "rewards/margins": 0.17159080505371094, "rewards/rejected": 0.8046280145645142, "step": 6431 }, { "epoch": 3.47, "learning_rate": 4.5468952627155385e-09, "logits/chosen": -2.083359718322754, "logits/rejected": -2.0896456241607666, "logps/chosen": -1.546101689338684, "logps/rejected": -2.9311726093292236, "loss": 0.3959, "rewards/accuracies": 1.0, "rewards/chosen": 1.4427030086517334, "rewards/margins": 0.7221047878265381, "rewards/rejected": 0.7205982208251953, "step": 6432 }, { "epoch": 3.47, "learning_rate": 4.537800626486576e-09, "logits/chosen": -2.056321382522583, "logits/rejected": -2.0536766052246094, "logps/chosen": -5.679727554321289, "logps/rejected": -2.9019510746002197, "loss": 0.4726, "rewards/accuracies": 1.0, "rewards/chosen": 1.149984359741211, "rewards/margins": 0.5039398074150085, "rewards/rejected": 0.6460445523262024, "step": 6433 }, { "epoch": 3.47, "learning_rate": 4.5287146624730875e-09, "logits/chosen": -2.007615566253662, "logits/rejected": -2.3139259815216064, "logps/chosen": -0.2576812207698822, "logps/rejected": -0.2860531806945801, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 1.0034544467926025, "rewards/margins": 0.01511770486831665, "rewards/rejected": 0.9883367419242859, "step": 6434 }, { "epoch": 3.47, "learning_rate": 4.519637372408275e-09, "logits/chosen": -1.9944477081298828, "logits/rejected": -2.283651828765869, "logps/chosen": -0.8144654035568237, "logps/rejected": -0.9071412086486816, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.866700291633606, "rewards/margins": 0.0054604411125183105, "rewards/rejected": 0.8612398505210876, "step": 6435 }, { "epoch": 3.47, "learning_rate": 4.510568758023709e-09, "logits/chosen": -2.1542952060699463, "logits/rejected": -2.154904365539551, "logps/chosen": -0.16567036509513855, "logps/rejected": -7.280354022979736, "loss": 0.3867, "rewards/accuracies": 1.0, "rewards/chosen": 0.9725346565246582, "rewards/margins": 0.750623345375061, "rewards/rejected": 0.22191129624843597, "step": 6436 }, { "epoch": 3.47, "learning_rate": 4.501508821049271e-09, "logits/chosen": -1.9434586763381958, "logits/rejected": -2.250678300857544, "logps/chosen": -3.1632742881774902, "logps/rejected": -3.0676140785217285, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.794203519821167, "rewards/margins": 0.026903510093688965, "rewards/rejected": 0.767300009727478, "step": 6437 }, { "epoch": 3.47, "learning_rate": 4.492457563213225e-09, "logits/chosen": -2.1210479736328125, "logits/rejected": -2.1175622940063477, "logps/chosen": -5.226964950561523, "logps/rejected": -4.826760292053223, "loss": 0.3668, "rewards/accuracies": 1.0, "rewards/chosen": 1.3470138311386108, "rewards/margins": 0.8139311671257019, "rewards/rejected": 0.5330826640129089, "step": 6438 }, { "epoch": 3.47, "learning_rate": 4.48341498624214e-09, "logits/chosen": -2.219712257385254, "logits/rejected": -2.251253604888916, "logps/chosen": -5.165108680725098, "logps/rejected": -22.696027755737305, "loss": 0.7977, "rewards/accuracies": 0.0, "rewards/chosen": 1.1230109930038452, "rewards/margins": -0.19925522804260254, "rewards/rejected": 1.3222662210464478, "step": 6439 }, { "epoch": 3.47, "learning_rate": 4.4743810918609456e-09, "logits/chosen": -2.091003179550171, "logits/rejected": -2.337416172027588, "logps/chosen": -0.5378662347793579, "logps/rejected": -0.5417981147766113, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 1.162118911743164, "rewards/margins": 0.013943314552307129, "rewards/rejected": 1.148175597190857, "step": 6440 }, { "epoch": 3.47, "learning_rate": 4.465355881792926e-09, "logits/chosen": -2.2087392807006836, "logits/rejected": -2.380096912384033, "logps/chosen": -3.6003847122192383, "logps/rejected": -1.13828706741333, "loss": 0.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.7350448966026306, "rewards/margins": -0.1529943346977234, "rewards/rejected": 0.888039231300354, "step": 6441 }, { "epoch": 3.47, "learning_rate": 4.456339357759692e-09, "logits/chosen": -2.087003231048584, "logits/rejected": -2.245917797088623, "logps/chosen": -0.8366731405258179, "logps/rejected": -1.2868545055389404, "loss": 0.7276, "rewards/accuracies": 0.0, "rewards/chosen": 0.8803892135620117, "rewards/margins": -0.06777364015579224, "rewards/rejected": 0.948162853717804, "step": 6442 }, { "epoch": 3.48, "learning_rate": 4.447331521481212e-09, "logits/chosen": -2.1439714431762695, "logits/rejected": -2.1485049724578857, "logps/chosen": -4.191369533538818, "logps/rejected": -0.46411484479904175, "loss": 0.5619, "rewards/accuracies": 1.0, "rewards/chosen": 1.249049186706543, "rewards/margins": 0.2822501063346863, "rewards/rejected": 0.9667990803718567, "step": 6443 }, { "epoch": 3.48, "learning_rate": 4.438332374675791e-09, "logits/chosen": -2.181541681289673, "logits/rejected": -2.175861358642578, "logps/chosen": -3.465834617614746, "logps/rejected": -5.423871994018555, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 0.9054004549980164, "rewards/margins": 0.31070250272750854, "rewards/rejected": 0.5946979522705078, "step": 6444 }, { "epoch": 3.48, "learning_rate": 4.429341919060075e-09, "logits/chosen": -2.127347230911255, "logits/rejected": -2.168069839477539, "logps/chosen": -4.562032222747803, "logps/rejected": -8.457956314086914, "loss": 0.5667, "rewards/accuracies": 1.0, "rewards/chosen": 1.2280199527740479, "rewards/margins": 0.2711387872695923, "rewards/rejected": 0.9568811655044556, "step": 6445 }, { "epoch": 3.48, "learning_rate": 4.42036015634904e-09, "logits/chosen": -2.1139092445373535, "logits/rejected": -2.280552864074707, "logps/chosen": -0.3430171012878418, "logps/rejected": -0.39018213748931885, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.8931862115859985, "rewards/margins": 0.01796668767929077, "rewards/rejected": 0.8752195239067078, "step": 6446 }, { "epoch": 3.48, "learning_rate": 4.411387088256035e-09, "logits/chosen": -2.1046009063720703, "logits/rejected": -2.0715456008911133, "logps/chosen": -10.26457405090332, "logps/rejected": -3.4761900901794434, "loss": 0.3048, "rewards/accuracies": 1.0, "rewards/chosen": 1.755122184753418, "rewards/margins": 1.031830072402954, "rewards/rejected": 0.7232920527458191, "step": 6447 }, { "epoch": 3.48, "learning_rate": 4.402422716492726e-09, "logits/chosen": -2.1273133754730225, "logits/rejected": -2.200948715209961, "logps/chosen": -13.88232421875, "logps/rejected": -11.50844955444336, "loss": 0.4684, "rewards/accuracies": 1.0, "rewards/chosen": 1.612866759300232, "rewards/margins": 0.5152143239974976, "rewards/rejected": 1.0976524353027344, "step": 6448 }, { "epoch": 3.48, "learning_rate": 4.393467042769128e-09, "logits/chosen": -2.1346347332000732, "logits/rejected": -2.139155864715576, "logps/chosen": -2.572042942047119, "logps/rejected": -5.6775102615356445, "loss": 0.4202, "rewards/accuracies": 1.0, "rewards/chosen": 1.0127586126327515, "rewards/margins": 0.6495176553726196, "rewards/rejected": 0.36324092745780945, "step": 6449 }, { "epoch": 3.48, "learning_rate": 4.38452006879359e-09, "logits/chosen": -2.1262154579162598, "logits/rejected": -2.1363115310668945, "logps/chosen": -1.1364600658416748, "logps/rejected": -1.8671942949295044, "loss": 0.4978, "rewards/accuracies": 1.0, "rewards/chosen": 1.2288895845413208, "rewards/margins": 0.43834632635116577, "rewards/rejected": 0.790543258190155, "step": 6450 }, { "epoch": 3.48, "learning_rate": 4.375581796272809e-09, "logits/chosen": -2.054827928543091, "logits/rejected": -2.2783596515655518, "logps/chosen": -3.7387120723724365, "logps/rejected": -0.6265965700149536, "loss": 0.7275, "rewards/accuracies": 0.0, "rewards/chosen": 0.8396921157836914, "rewards/margins": -0.06750422716140747, "rewards/rejected": 0.9071963429450989, "step": 6451 }, { "epoch": 3.48, "learning_rate": 4.366652226911821e-09, "logits/chosen": -2.1134116649627686, "logits/rejected": -2.313016414642334, "logps/chosen": -0.3629370927810669, "logps/rejected": -0.3102208971977234, "loss": 0.6686, "rewards/accuracies": 1.0, "rewards/chosen": 1.0439685583114624, "rewards/margins": 0.049698054790496826, "rewards/rejected": 0.9942705035209656, "step": 6452 }, { "epoch": 3.48, "learning_rate": 4.357731362413997e-09, "logits/chosen": -2.102024793624878, "logits/rejected": -2.086970567703247, "logps/chosen": -5.500354766845703, "logps/rejected": -8.365832328796387, "loss": 0.2585, "rewards/accuracies": 1.0, "rewards/chosen": 1.3789901733398438, "rewards/margins": 1.220949649810791, "rewards/rejected": 0.15804052352905273, "step": 6453 }, { "epoch": 3.48, "learning_rate": 4.348819204481058e-09, "logits/chosen": -2.055722951889038, "logits/rejected": -2.349856376647949, "logps/chosen": -1.966206669807434, "logps/rejected": -1.706227421760559, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 1.2275141477584839, "rewards/margins": 0.025450468063354492, "rewards/rejected": 1.2020636796951294, "step": 6454 }, { "epoch": 3.48, "learning_rate": 4.339915754813056e-09, "logits/chosen": -2.0432796478271484, "logits/rejected": -2.3054487705230713, "logps/chosen": -0.2842690646648407, "logps/rejected": -0.2913624942302704, "loss": 0.6643, "rewards/accuracies": 1.0, "rewards/chosen": 0.9743518829345703, "rewards/margins": 0.05856168270111084, "rewards/rejected": 0.9157902002334595, "step": 6455 }, { "epoch": 3.48, "learning_rate": 4.331021015108371e-09, "logits/chosen": -2.06843638420105, "logits/rejected": -2.069343328475952, "logps/chosen": -0.13851894438266754, "logps/rejected": -6.543689250946045, "loss": 0.3774, "rewards/accuracies": 1.0, "rewards/chosen": 1.0010508298873901, "rewards/margins": 0.7798483371734619, "rewards/rejected": 0.2212025225162506, "step": 6456 }, { "epoch": 3.48, "learning_rate": 4.3221349870637305e-09, "logits/chosen": -2.169849395751953, "logits/rejected": -2.206017255783081, "logps/chosen": -2.714479923248291, "logps/rejected": -11.110137939453125, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": 1.2896063327789307, "rewards/margins": 0.47047293186187744, "rewards/rejected": 0.8191334009170532, "step": 6457 }, { "epoch": 3.48, "learning_rate": 4.313257672374226e-09, "logits/chosen": -1.9769370555877686, "logits/rejected": -1.9792125225067139, "logps/chosen": -3.5895650386810303, "logps/rejected": -1.8536456823349, "loss": 0.3671, "rewards/accuracies": 1.0, "rewards/chosen": 1.501434326171875, "rewards/margins": 0.8130287528038025, "rewards/rejected": 0.6884055733680725, "step": 6458 }, { "epoch": 3.48, "learning_rate": 4.30438907273325e-09, "logits/chosen": -2.069580316543579, "logits/rejected": -2.179901599884033, "logps/chosen": -0.2017102986574173, "logps/rejected": -0.2095339000225067, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.8333560824394226, "rewards/margins": 0.025440692901611328, "rewards/rejected": 0.8079153895378113, "step": 6459 }, { "epoch": 3.48, "learning_rate": 4.295529189832553e-09, "logits/chosen": -2.0369973182678223, "logits/rejected": -2.046152353286743, "logps/chosen": -1.1662429571151733, "logps/rejected": -4.28021764755249, "loss": 0.4299, "rewards/accuracies": 1.0, "rewards/chosen": 1.0306166410446167, "rewards/margins": 0.6215574741363525, "rewards/rejected": 0.40905919671058655, "step": 6460 }, { "epoch": 3.48, "learning_rate": 4.286678025362212e-09, "logits/chosen": -2.0039310455322266, "logits/rejected": -2.2911782264709473, "logps/chosen": -0.1421995609998703, "logps/rejected": -0.15875214338302612, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.8052069544792175, "rewards/margins": 0.0062220096588134766, "rewards/rejected": 0.798984944820404, "step": 6461 }, { "epoch": 3.49, "learning_rate": 4.277835581010641e-09, "logits/chosen": -2.069990873336792, "logits/rejected": -2.0759661197662354, "logps/chosen": -3.8349146842956543, "logps/rejected": -4.788254261016846, "loss": 0.3661, "rewards/accuracies": 1.0, "rewards/chosen": 1.2669219970703125, "rewards/margins": 0.8161449432373047, "rewards/rejected": 0.4507770240306854, "step": 6462 }, { "epoch": 3.49, "learning_rate": 4.269001858464599e-09, "logits/chosen": -2.018392562866211, "logits/rejected": -2.009026050567627, "logps/chosen": -4.713979244232178, "logps/rejected": -5.594994068145752, "loss": 0.403, "rewards/accuracies": 1.0, "rewards/chosen": 1.2068532705307007, "rewards/margins": 0.7005624175071716, "rewards/rejected": 0.506290853023529, "step": 6463 }, { "epoch": 3.49, "learning_rate": 4.260176859409181e-09, "logits/chosen": -2.097615957260132, "logits/rejected": -2.0963380336761475, "logps/chosen": -5.424173355102539, "logps/rejected": -6.124712944030762, "loss": 0.2762, "rewards/accuracies": 1.0, "rewards/chosen": 1.6775903701782227, "rewards/margins": 1.1452782154083252, "rewards/rejected": 0.5323120951652527, "step": 6464 }, { "epoch": 3.49, "learning_rate": 4.251360585527808e-09, "logits/chosen": -2.003596305847168, "logits/rejected": -1.9987531900405884, "logps/chosen": -3.036489963531494, "logps/rejected": -2.4752700328826904, "loss": 0.3795, "rewards/accuracies": 1.0, "rewards/chosen": 1.4631630182266235, "rewards/margins": 0.7730361223220825, "rewards/rejected": 0.690126895904541, "step": 6465 }, { "epoch": 3.49, "learning_rate": 4.242553038502244e-09, "logits/chosen": -2.0756006240844727, "logits/rejected": -2.082547426223755, "logps/chosen": -2.3004114627838135, "logps/rejected": -4.488753318786621, "loss": 0.5781, "rewards/accuracies": 1.0, "rewards/chosen": 0.9435696005821228, "rewards/margins": 0.24497437477111816, "rewards/rejected": 0.6985952258110046, "step": 6466 }, { "epoch": 3.49, "learning_rate": 4.233754220012592e-09, "logits/chosen": -1.988588809967041, "logits/rejected": -2.306884288787842, "logps/chosen": -0.15735259652137756, "logps/rejected": -0.1796366274356842, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.9452821612358093, "rewards/margins": 0.030730128288269043, "rewards/rejected": 0.9145520329475403, "step": 6467 }, { "epoch": 3.49, "learning_rate": 4.22496413173728e-09, "logits/chosen": -2.0202090740203857, "logits/rejected": -2.3079919815063477, "logps/chosen": -2.480006217956543, "logps/rejected": -2.280909538269043, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.5343036651611328, "rewards/margins": 0.023308098316192627, "rewards/rejected": 0.5109955668449402, "step": 6468 }, { "epoch": 3.49, "learning_rate": 4.216182775353072e-09, "logits/chosen": -2.0940675735473633, "logits/rejected": -2.099456548690796, "logps/chosen": -2.335045337677002, "logps/rejected": -4.438901901245117, "loss": 0.4924, "rewards/accuracies": 1.0, "rewards/chosen": 1.0570850372314453, "rewards/margins": 0.45229995250701904, "rewards/rejected": 0.6047850847244263, "step": 6469 }, { "epoch": 3.49, "learning_rate": 4.2074101525350795e-09, "logits/chosen": -2.1464953422546387, "logits/rejected": -2.2799887657165527, "logps/chosen": -0.9274306297302246, "logps/rejected": -0.6732237339019775, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 1.0941475629806519, "rewards/margins": 0.029322147369384766, "rewards/rejected": 1.064825415611267, "step": 6470 }, { "epoch": 3.49, "learning_rate": 4.198646264956729e-09, "logits/chosen": -2.0945918560028076, "logits/rejected": -2.107300281524658, "logps/chosen": -4.595171928405762, "logps/rejected": -5.366737365722656, "loss": 0.2978, "rewards/accuracies": 1.0, "rewards/chosen": 1.9290589094161987, "rewards/margins": 1.0586602687835693, "rewards/rejected": 0.8703987002372742, "step": 6471 }, { "epoch": 3.49, "learning_rate": 4.18989111428979e-09, "logits/chosen": -2.0720887184143066, "logits/rejected": -2.3700006008148193, "logps/chosen": -0.20626291632652283, "logps/rejected": -0.4807406961917877, "loss": 0.667, "rewards/accuracies": 1.0, "rewards/chosen": 0.8333298563957214, "rewards/margins": 0.05308842658996582, "rewards/rejected": 0.7802414298057556, "step": 6472 }, { "epoch": 3.49, "learning_rate": 4.181144702204375e-09, "logits/chosen": -2.1206233501434326, "logits/rejected": -2.3028547763824463, "logps/chosen": -0.5351366400718689, "logps/rejected": -0.5258779525756836, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 1.0018506050109863, "rewards/margins": 0.025638580322265625, "rewards/rejected": 0.9762120246887207, "step": 6473 }, { "epoch": 3.49, "learning_rate": 4.172407030368913e-09, "logits/chosen": -2.0916504859924316, "logits/rejected": -2.4013102054595947, "logps/chosen": -0.5558817982673645, "logps/rejected": -0.5119649171829224, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.9967284202575684, "rewards/margins": 0.006890356540679932, "rewards/rejected": 0.9898380637168884, "step": 6474 }, { "epoch": 3.49, "learning_rate": 4.163678100450174e-09, "logits/chosen": -2.0632987022399902, "logits/rejected": -2.0683538913726807, "logps/chosen": -1.7919015884399414, "logps/rejected": -3.38852596282959, "loss": 0.4818, "rewards/accuracies": 1.0, "rewards/chosen": 1.0665658712387085, "rewards/margins": 0.47964292764663696, "rewards/rejected": 0.5869229435920715, "step": 6475 }, { "epoch": 3.49, "learning_rate": 4.154957914113261e-09, "logits/chosen": -2.13154935836792, "logits/rejected": -2.3171887397766113, "logps/chosen": -1.4523961544036865, "logps/rejected": -1.4875081777572632, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.7307273149490356, "rewards/margins": 0.00018936395645141602, "rewards/rejected": 0.7305379509925842, "step": 6476 }, { "epoch": 3.49, "learning_rate": 4.146246473021603e-09, "logits/chosen": -2.1717214584350586, "logits/rejected": -2.3624536991119385, "logps/chosen": -9.670522689819336, "logps/rejected": -10.49959945678711, "loss": 0.6233, "rewards/accuracies": 1.0, "rewards/chosen": 1.2529172897338867, "rewards/margins": 0.14489495754241943, "rewards/rejected": 1.1080223321914673, "step": 6477 }, { "epoch": 3.49, "learning_rate": 4.137543778836966e-09, "logits/chosen": -2.1437907218933105, "logits/rejected": -2.1466782093048096, "logps/chosen": -1.270789623260498, "logps/rejected": -4.05295467376709, "loss": 0.4815, "rewards/accuracies": 1.0, "rewards/chosen": 1.0343624353408813, "rewards/margins": 0.4805682301521301, "rewards/rejected": 0.5537942051887512, "step": 6478 }, { "epoch": 3.49, "learning_rate": 4.128849833219455e-09, "logits/chosen": -2.141507863998413, "logits/rejected": -2.1337223052978516, "logps/chosen": -12.154577255249023, "logps/rejected": -1.080977201461792, "loss": 0.4096, "rewards/accuracies": 1.0, "rewards/chosen": 1.4312070608139038, "rewards/margins": 0.6808478832244873, "rewards/rejected": 0.7503591775894165, "step": 6479 }, { "epoch": 3.5, "learning_rate": 4.120164637827478e-09, "logits/chosen": -2.092388153076172, "logits/rejected": -2.0922629833221436, "logps/chosen": -0.4095349907875061, "logps/rejected": -3.106693744659424, "loss": 0.5629, "rewards/accuracies": 1.0, "rewards/chosen": 0.8359290361404419, "rewards/margins": 0.2801169753074646, "rewards/rejected": 0.5558120608329773, "step": 6480 }, { "epoch": 3.5, "learning_rate": 4.111488194317814e-09, "logits/chosen": -2.1308045387268066, "logits/rejected": -2.1337954998016357, "logps/chosen": -0.466842919588089, "logps/rejected": -8.741669654846191, "loss": 0.3902, "rewards/accuracies": 1.0, "rewards/chosen": 1.2049564123153687, "rewards/margins": 0.7395334839820862, "rewards/rejected": 0.46542292833328247, "step": 6481 }, { "epoch": 3.5, "learning_rate": 4.102820504345544e-09, "logits/chosen": -2.01839542388916, "logits/rejected": -2.3251702785491943, "logps/chosen": -0.32397884130477905, "logps/rejected": -0.36234065890312195, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 0.9613885879516602, "rewards/margins": 0.03192329406738281, "rewards/rejected": 0.9294652938842773, "step": 6482 }, { "epoch": 3.5, "learning_rate": 4.094161569564087e-09, "logits/chosen": -2.138209819793701, "logits/rejected": -2.3027396202087402, "logps/chosen": -0.7910991311073303, "logps/rejected": -0.7768170237541199, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 1.0139740705490112, "rewards/margins": -0.00449371337890625, "rewards/rejected": 1.0184677839279175, "step": 6483 }, { "epoch": 3.5, "learning_rate": 4.085511391625207e-09, "logits/chosen": -2.085730791091919, "logits/rejected": -2.2745325565338135, "logps/chosen": -2.2039296627044678, "logps/rejected": -7.671499252319336, "loss": 0.5766, "rewards/accuracies": 1.0, "rewards/chosen": 0.8133875727653503, "rewards/margins": 0.24843060970306396, "rewards/rejected": 0.5649569630622864, "step": 6484 }, { "epoch": 3.5, "learning_rate": 4.076869972178954e-09, "logits/chosen": -2.1099865436553955, "logits/rejected": -2.3036608695983887, "logps/chosen": -0.1541500836610794, "logps/rejected": -0.1829252690076828, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.8359455466270447, "rewards/margins": 0.007552742958068848, "rewards/rejected": 0.8283928036689758, "step": 6485 }, { "epoch": 3.5, "learning_rate": 4.068237312873751e-09, "logits/chosen": -2.1524815559387207, "logits/rejected": -2.1523492336273193, "logps/chosen": -4.645996570587158, "logps/rejected": -2.570722818374634, "loss": 0.2473, "rewards/accuracies": 1.0, "rewards/chosen": 1.8294063806533813, "rewards/margins": 1.270827293395996, "rewards/rejected": 0.5585790872573853, "step": 6486 }, { "epoch": 3.5, "learning_rate": 4.059613415356339e-09, "logits/chosen": -2.032789707183838, "logits/rejected": -2.0115230083465576, "logps/chosen": -3.592660427093506, "logps/rejected": -7.112759113311768, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 1.128996729850769, "rewards/margins": 0.693079948425293, "rewards/rejected": 0.4359167516231537, "step": 6487 }, { "epoch": 3.5, "learning_rate": 4.050998281271773e-09, "logits/chosen": -2.134230613708496, "logits/rejected": -2.353774070739746, "logps/chosen": -0.8399085402488708, "logps/rejected": -0.9129687547683716, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 1.0418550968170166, "rewards/margins": 0.00906062126159668, "rewards/rejected": 1.03279447555542, "step": 6488 }, { "epoch": 3.5, "learning_rate": 4.042391912263455e-09, "logits/chosen": -2.1760671138763428, "logits/rejected": -2.1561837196350098, "logps/chosen": -12.093039512634277, "logps/rejected": -1.2386107444763184, "loss": 0.3974, "rewards/accuracies": 1.0, "rewards/chosen": 1.6875801086425781, "rewards/margins": 0.717442512512207, "rewards/rejected": 0.9701375961303711, "step": 6489 }, { "epoch": 3.5, "learning_rate": 4.033794309973105e-09, "logits/chosen": -2.1665613651275635, "logits/rejected": -2.156493663787842, "logps/chosen": -4.689592361450195, "logps/rejected": -4.848420143127441, "loss": 0.6052, "rewards/accuracies": 1.0, "rewards/chosen": 0.9210333824157715, "rewards/margins": 0.18443870544433594, "rewards/rejected": 0.7365946769714355, "step": 6490 }, { "epoch": 3.5, "learning_rate": 4.0252054760407605e-09, "logits/chosen": -2.057648181915283, "logits/rejected": -2.052774429321289, "logps/chosen": -1.93569815158844, "logps/rejected": -7.041347026824951, "loss": 0.4163, "rewards/accuracies": 1.0, "rewards/chosen": 1.3053245544433594, "rewards/margins": 0.6609522104263306, "rewards/rejected": 0.6443723440170288, "step": 6491 }, { "epoch": 3.5, "learning_rate": 4.016625412104824e-09, "logits/chosen": -2.086697816848755, "logits/rejected": -2.086324691772461, "logps/chosen": -0.8102987408638, "logps/rejected": -3.987931489944458, "loss": 0.4912, "rewards/accuracies": 1.0, "rewards/chosen": 1.1868174076080322, "rewards/margins": 0.45525914430618286, "rewards/rejected": 0.7315582633018494, "step": 6492 }, { "epoch": 3.5, "learning_rate": 4.008054119801984e-09, "logits/chosen": -2.222261905670166, "logits/rejected": -2.111175537109375, "logps/chosen": -16.144498825073242, "logps/rejected": -2.1667909622192383, "loss": 0.1777, "rewards/accuracies": 1.0, "rewards/chosen": 2.3791444301605225, "rewards/margins": 1.637458324432373, "rewards/rejected": 0.7416860461235046, "step": 6493 }, { "epoch": 3.5, "learning_rate": 3.999491600767268e-09, "logits/chosen": -2.1873772144317627, "logits/rejected": -2.1848182678222656, "logps/chosen": -5.840452194213867, "logps/rejected": -3.879204750061035, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 1.9093172550201416, "rewards/margins": 1.4327502250671387, "rewards/rejected": 0.47656700015068054, "step": 6494 }, { "epoch": 3.5, "learning_rate": 3.9909378566340415e-09, "logits/chosen": -2.1309778690338135, "logits/rejected": -2.131298780441284, "logps/chosen": -1.8266282081604004, "logps/rejected": -1.417959451675415, "loss": 0.4745, "rewards/accuracies": 1.0, "rewards/chosen": 1.2485764026641846, "rewards/margins": 0.4988095760345459, "rewards/rejected": 0.7497668266296387, "step": 6495 }, { "epoch": 3.5, "learning_rate": 3.9823928890339864e-09, "logits/chosen": -2.1493875980377197, "logits/rejected": -2.284212350845337, "logps/chosen": -0.3229556679725647, "logps/rejected": -0.3578298091888428, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.9518820643424988, "rewards/margins": 0.010571956634521484, "rewards/rejected": 0.9413101077079773, "step": 6496 }, { "epoch": 3.5, "learning_rate": 3.9738566995971055e-09, "logits/chosen": -2.030494213104248, "logits/rejected": -2.031015396118164, "logps/chosen": -1.748482584953308, "logps/rejected": -5.637584686279297, "loss": 0.3, "rewards/accuracies": 1.0, "rewards/chosen": 1.5402101278305054, "rewards/margins": 1.0501902103424072, "rewards/rejected": 0.49001988768577576, "step": 6497 }, { "epoch": 3.5, "learning_rate": 3.965329289951741e-09, "logits/chosen": -2.196096420288086, "logits/rejected": -2.1938107013702393, "logps/chosen": -2.9781923294067383, "logps/rejected": -5.655439853668213, "loss": 0.2095, "rewards/accuracies": 1.0, "rewards/chosen": 1.7858985662460327, "rewards/margins": 1.4565045833587646, "rewards/rejected": 0.32939401268959045, "step": 6498 }, { "epoch": 3.51, "learning_rate": 3.956810661724547e-09, "logits/chosen": -2.0881855487823486, "logits/rejected": -2.320084810256958, "logps/chosen": -5.033899307250977, "logps/rejected": -1.0501227378845215, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.7459791302680969, "rewards/margins": -0.00014221668243408203, "rewards/rejected": 0.746121346950531, "step": 6499 }, { "epoch": 3.51, "learning_rate": 3.948300816540512e-09, "logits/chosen": -2.212476968765259, "logits/rejected": -2.2472782135009766, "logps/chosen": -8.123576164245605, "logps/rejected": -9.28188419342041, "loss": 0.6318, "rewards/accuracies": 1.0, "rewards/chosen": 1.2554467916488647, "rewards/margins": 0.12668442726135254, "rewards/rejected": 1.1287623643875122, "step": 6500 }, { "epoch": 3.51, "learning_rate": 3.9397997560229435e-09, "logits/chosen": -2.118988513946533, "logits/rejected": -2.119292974472046, "logps/chosen": -1.0188194513320923, "logps/rejected": -1.22150719165802, "loss": 0.6054, "rewards/accuracies": 1.0, "rewards/chosen": 1.0201938152313232, "rewards/margins": 0.1840086579322815, "rewards/rejected": 0.8361851572990417, "step": 6501 }, { "epoch": 3.51, "learning_rate": 3.931307481793478e-09, "logits/chosen": -2.14707088470459, "logits/rejected": -2.309126138687134, "logps/chosen": -0.3006126880645752, "logps/rejected": -0.32880955934524536, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.9168734550476074, "rewards/margins": -0.001282036304473877, "rewards/rejected": 0.9181554913520813, "step": 6502 }, { "epoch": 3.51, "learning_rate": 3.922823995472069e-09, "logits/chosen": -2.0078327655792236, "logits/rejected": -2.297104597091675, "logps/chosen": -0.29468533396720886, "logps/rejected": -0.309848815202713, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.852124035358429, "rewards/margins": 0.006045937538146973, "rewards/rejected": 0.846078097820282, "step": 6503 }, { "epoch": 3.51, "learning_rate": 3.914349298677e-09, "logits/chosen": -2.143799066543579, "logits/rejected": -2.1371026039123535, "logps/chosen": -1.6710892915725708, "logps/rejected": -3.756373643875122, "loss": 0.7645, "rewards/accuracies": 0.0, "rewards/chosen": 0.8470731973648071, "rewards/margins": -0.1379566788673401, "rewards/rejected": 0.9850298762321472, "step": 6504 }, { "epoch": 3.51, "learning_rate": 3.905883393024873e-09, "logits/chosen": -2.1071693897247314, "logits/rejected": -2.1102688312530518, "logps/chosen": -1.7176513671875, "logps/rejected": -6.142965316772461, "loss": 0.3558, "rewards/accuracies": 1.0, "rewards/chosen": 1.5299713611602783, "rewards/margins": 0.8503382802009583, "rewards/rejected": 0.6796330809593201, "step": 6505 }, { "epoch": 3.51, "learning_rate": 3.897426280130617e-09, "logits/chosen": -2.1027207374572754, "logits/rejected": -2.1027486324310303, "logps/chosen": -1.3131130933761597, "logps/rejected": -2.336245059967041, "loss": 0.5717, "rewards/accuracies": 1.0, "rewards/chosen": 1.1205910444259644, "rewards/margins": 0.25970715284347534, "rewards/rejected": 0.860883891582489, "step": 6506 }, { "epoch": 3.51, "learning_rate": 3.888977961607481e-09, "logits/chosen": -2.099738597869873, "logits/rejected": -2.340684175491333, "logps/chosen": -0.1681884229183197, "logps/rejected": -0.107975535094738, "loss": 0.7242, "rewards/accuracies": 0.0, "rewards/chosen": 1.0243990421295166, "rewards/margins": -0.0611034631729126, "rewards/rejected": 1.0855025053024292, "step": 6507 }, { "epoch": 3.51, "learning_rate": 3.880538439067038e-09, "logits/chosen": -2.1427533626556396, "logits/rejected": -2.1429669857025146, "logps/chosen": -2.090932846069336, "logps/rejected": -3.474142074584961, "loss": 0.5036, "rewards/accuracies": 1.0, "rewards/chosen": 1.0169062614440918, "rewards/margins": 0.4236622452735901, "rewards/rejected": 0.5932440161705017, "step": 6508 }, { "epoch": 3.51, "learning_rate": 3.872107714119188e-09, "logits/chosen": -2.0724594593048096, "logits/rejected": -2.0773134231567383, "logps/chosen": -2.874967575073242, "logps/rejected": -5.46198034286499, "loss": 0.3361, "rewards/accuracies": 1.0, "rewards/chosen": 1.3663710355758667, "rewards/margins": 0.9175637364387512, "rewards/rejected": 0.4488072991371155, "step": 6509 }, { "epoch": 3.51, "learning_rate": 3.863685788372145e-09, "logits/chosen": -2.119485378265381, "logits/rejected": -2.289729595184326, "logps/chosen": -0.4031793773174286, "logps/rejected": -0.41158315539360046, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.8413829207420349, "rewards/margins": -0.005172312259674072, "rewards/rejected": 0.846555233001709, "step": 6510 }, { "epoch": 3.51, "learning_rate": 3.855272663432441e-09, "logits/chosen": -2.002516984939575, "logits/rejected": -2.010993003845215, "logps/chosen": -1.4670569896697998, "logps/rejected": -2.4728636741638184, "loss": 0.4779, "rewards/accuracies": 1.0, "rewards/chosen": 1.102787733078003, "rewards/margins": 0.48980361223220825, "rewards/rejected": 0.6129841208457947, "step": 6511 }, { "epoch": 3.51, "learning_rate": 3.846868340904946e-09, "logits/chosen": -2.1234753131866455, "logits/rejected": -2.287217855453491, "logps/chosen": -0.2356482446193695, "logps/rejected": -0.2275700569152832, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.8605592846870422, "rewards/margins": 0.03311920166015625, "rewards/rejected": 0.827440083026886, "step": 6512 }, { "epoch": 3.51, "learning_rate": 3.8384728223928185e-09, "logits/chosen": -2.0224149227142334, "logits/rejected": -2.294722318649292, "logps/chosen": -0.22838202118873596, "logps/rejected": -0.2675984501838684, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 1.0165767669677734, "rewards/margins": 0.010752201080322266, "rewards/rejected": 1.0058245658874512, "step": 6513 }, { "epoch": 3.51, "learning_rate": 3.830086109497599e-09, "logits/chosen": -2.0970757007598877, "logits/rejected": -2.091912031173706, "logps/chosen": -3.4330239295959473, "logps/rejected": -4.230285167694092, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": 1.051841139793396, "rewards/margins": 0.46164625883102417, "rewards/rejected": 0.5901948809623718, "step": 6514 }, { "epoch": 3.51, "learning_rate": 3.821708203819069e-09, "logits/chosen": -2.0935513973236084, "logits/rejected": -2.3859078884124756, "logps/chosen": -1.3766717910766602, "logps/rejected": -1.1837180852890015, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.7897023558616638, "rewards/margins": 0.010384440422058105, "rewards/rejected": 0.7793179154396057, "step": 6515 }, { "epoch": 3.51, "learning_rate": 3.8133391069553876e-09, "logits/chosen": -2.0054478645324707, "logits/rejected": -2.013507843017578, "logps/chosen": -1.4091434478759766, "logps/rejected": -2.678483486175537, "loss": 0.5049, "rewards/accuracies": 1.0, "rewards/chosen": 1.0631636381149292, "rewards/margins": 0.42039424180984497, "rewards/rejected": 0.6427693963050842, "step": 6516 }, { "epoch": 3.52, "learning_rate": 3.804978820503013e-09, "logits/chosen": -2.1680350303649902, "logits/rejected": -2.196678638458252, "logps/chosen": -3.9815611839294434, "logps/rejected": -20.886259078979492, "loss": 0.1883, "rewards/accuracies": 1.0, "rewards/chosen": 1.7434924840927124, "rewards/margins": 1.5739362239837646, "rewards/rejected": 0.16955624520778656, "step": 6517 }, { "epoch": 3.52, "learning_rate": 3.7966273460567245e-09, "logits/chosen": -2.126629114151001, "logits/rejected": -2.324946880340576, "logps/chosen": -0.8852524757385254, "logps/rejected": -1.0315959453582764, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.8012441992759705, "rewards/margins": 0.019403398036956787, "rewards/rejected": 0.7818408012390137, "step": 6518 }, { "epoch": 3.52, "learning_rate": 3.78828468520962e-09, "logits/chosen": -2.2683017253875732, "logits/rejected": -2.4221458435058594, "logps/chosen": -0.43490180373191833, "logps/rejected": -0.48187562823295593, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.8726134300231934, "rewards/margins": 0.0017401576042175293, "rewards/rejected": 0.8708732724189758, "step": 6519 }, { "epoch": 3.52, "learning_rate": 3.77995083955312e-09, "logits/chosen": -2.123830795288086, "logits/rejected": -2.097517251968384, "logps/chosen": -14.237508773803711, "logps/rejected": -4.145427703857422, "loss": 0.1675, "rewards/accuracies": 1.0, "rewards/chosen": 2.3860056400299072, "rewards/margins": 1.7015860080718994, "rewards/rejected": 0.6844196319580078, "step": 6520 }, { "epoch": 3.52, "learning_rate": 3.771625810676959e-09, "logits/chosen": -2.169588565826416, "logits/rejected": -2.320988178253174, "logps/chosen": -4.59678840637207, "logps/rejected": -0.8495184183120728, "loss": 0.7205, "rewards/accuracies": 0.0, "rewards/chosen": 0.9576092958450317, "rewards/margins": -0.054045915603637695, "rewards/rejected": 1.0116552114486694, "step": 6521 }, { "epoch": 3.52, "learning_rate": 3.7633096001691954e-09, "logits/chosen": -2.0123775005340576, "logits/rejected": -2.0036818981170654, "logps/chosen": -4.449418067932129, "logps/rejected": -5.703582286834717, "loss": 0.2611, "rewards/accuracies": 1.0, "rewards/chosen": 1.6358520984649658, "rewards/margins": 1.2094037532806396, "rewards/rejected": 0.42644840478897095, "step": 6522 }, { "epoch": 3.52, "learning_rate": 3.7550022096161924e-09, "logits/chosen": -2.0413150787353516, "logits/rejected": -2.269742488861084, "logps/chosen": -7.742038726806641, "logps/rejected": -6.266889572143555, "loss": 0.7374, "rewards/accuracies": 0.0, "rewards/chosen": 0.5541208386421204, "rewards/margins": -0.08669912815093994, "rewards/rejected": 0.6408199667930603, "step": 6523 }, { "epoch": 3.52, "learning_rate": 3.746703640602639e-09, "logits/chosen": -2.1723592281341553, "logits/rejected": -2.171452283859253, "logps/chosen": -0.7835354208946228, "logps/rejected": -7.051710605621338, "loss": 0.5616, "rewards/accuracies": 1.0, "rewards/chosen": 1.1117010116577148, "rewards/margins": 0.28300589323043823, "rewards/rejected": 0.8286951184272766, "step": 6524 }, { "epoch": 3.52, "learning_rate": 3.738413894711556e-09, "logits/chosen": -2.057948589324951, "logits/rejected": -2.297565221786499, "logps/chosen": -3.424151659011841, "logps/rejected": -3.3177378177642822, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 1.1716669797897339, "rewards/margins": 0.02405571937561035, "rewards/rejected": 1.1476112604141235, "step": 6525 }, { "epoch": 3.52, "learning_rate": 3.730132973524263e-09, "logits/chosen": -2.0548899173736572, "logits/rejected": -2.2594621181488037, "logps/chosen": -0.8476755619049072, "logps/rejected": -0.7891528606414795, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 1.0160611867904663, "rewards/margins": 0.009511947631835938, "rewards/rejected": 1.0065492391586304, "step": 6526 }, { "epoch": 3.52, "learning_rate": 3.7218608786203976e-09, "logits/chosen": -2.0228044986724854, "logits/rejected": -2.0306007862091064, "logps/chosen": -1.336498498916626, "logps/rejected": -2.992156505584717, "loss": 0.5317, "rewards/accuracies": 1.0, "rewards/chosen": 1.0395876169204712, "rewards/margins": 0.3541123867034912, "rewards/rejected": 0.68547523021698, "step": 6527 }, { "epoch": 3.52, "learning_rate": 3.7135976115779123e-09, "logits/chosen": -2.167513370513916, "logits/rejected": -2.313703775405884, "logps/chosen": -0.5076236128807068, "logps/rejected": -2.2890679836273193, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 1.138453722000122, "rewards/margins": 0.03603482246398926, "rewards/rejected": 1.1024188995361328, "step": 6528 }, { "epoch": 3.52, "learning_rate": 3.7053431739730856e-09, "logits/chosen": -2.070011854171753, "logits/rejected": -2.2931571006774902, "logps/chosen": -0.6993467807769775, "logps/rejected": -0.734064519405365, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 1.082924723625183, "rewards/margins": 0.005564689636230469, "rewards/rejected": 1.0773600339889526, "step": 6529 }, { "epoch": 3.52, "learning_rate": 3.6970975673805028e-09, "logits/chosen": -2.16540265083313, "logits/rejected": -2.312922954559326, "logps/chosen": -4.359081745147705, "logps/rejected": -4.207273483276367, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.9013187289237976, "rewards/margins": 0.039333999156951904, "rewards/rejected": 0.8619847297668457, "step": 6530 }, { "epoch": 3.52, "learning_rate": 3.6888607933730775e-09, "logits/chosen": -2.11257004737854, "logits/rejected": -2.316770076751709, "logps/chosen": -0.5628808736801147, "logps/rejected": -0.6257586479187012, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.7123969793319702, "rewards/margins": 0.0188063383102417, "rewards/rejected": 0.6935906410217285, "step": 6531 }, { "epoch": 3.52, "learning_rate": 3.6806328535220143e-09, "logits/chosen": -2.1290249824523926, "logits/rejected": -2.309873580932617, "logps/chosen": -3.3558287620544434, "logps/rejected": -1.1252758502960205, "loss": 0.7075, "rewards/accuracies": 0.0, "rewards/chosen": 1.108939528465271, "rewards/margins": -0.028411865234375, "rewards/rejected": 1.137351393699646, "step": 6532 }, { "epoch": 3.52, "learning_rate": 3.6724137493968523e-09, "logits/chosen": -1.9978524446487427, "logits/rejected": -2.2670390605926514, "logps/chosen": -1.5378272533416748, "logps/rejected": -1.5140416622161865, "loss": 0.7055, "rewards/accuracies": 0.0, "rewards/chosen": 0.9872954487800598, "rewards/margins": -0.024644315242767334, "rewards/rejected": 1.0119397640228271, "step": 6533 }, { "epoch": 3.52, "learning_rate": 3.6642034825654424e-09, "logits/chosen": -2.1572425365448, "logits/rejected": -2.313992977142334, "logps/chosen": -0.8806089162826538, "logps/rejected": -0.8179450631141663, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8547317385673523, "rewards/margins": 0.02581787109375, "rewards/rejected": 0.8289138674736023, "step": 6534 }, { "epoch": 3.52, "learning_rate": 3.6560020545939486e-09, "logits/chosen": -2.0006165504455566, "logits/rejected": -2.0065460205078125, "logps/chosen": -5.892849922180176, "logps/rejected": -0.9984049797058105, "loss": 0.2498, "rewards/accuracies": 1.0, "rewards/chosen": 1.8377662897109985, "rewards/margins": 1.2597591876983643, "rewards/rejected": 0.5780071020126343, "step": 6535 }, { "epoch": 3.53, "learning_rate": 3.647809467046836e-09, "logits/chosen": -2.05290150642395, "logits/rejected": -2.2806804180145264, "logps/chosen": -1.2577283382415771, "logps/rejected": -7.337509632110596, "loss": 0.5222, "rewards/accuracies": 1.0, "rewards/chosen": 1.2763570547103882, "rewards/margins": 0.3771951198577881, "rewards/rejected": 0.8991619348526001, "step": 6536 }, { "epoch": 3.53, "learning_rate": 3.6396257214869106e-09, "logits/chosen": -1.971289038658142, "logits/rejected": -2.264075517654419, "logps/chosen": -6.5285539627075195, "logps/rejected": -1.168813705444336, "loss": 0.546, "rewards/accuracies": 1.0, "rewards/chosen": 1.2440637350082397, "rewards/margins": 0.3197522759437561, "rewards/rejected": 0.9243114590644836, "step": 6537 }, { "epoch": 3.53, "learning_rate": 3.6314508194752737e-09, "logits/chosen": -2.086094617843628, "logits/rejected": -2.115438938140869, "logps/chosen": -1.6338884830474854, "logps/rejected": -8.613041877746582, "loss": 0.2935, "rewards/accuracies": 1.0, "rewards/chosen": 1.7177906036376953, "rewards/margins": 1.0755603313446045, "rewards/rejected": 0.6422303318977356, "step": 6538 }, { "epoch": 3.53, "learning_rate": 3.623284762571338e-09, "logits/chosen": -1.9868746995925903, "logits/rejected": -2.3179948329925537, "logps/chosen": -0.16714918613433838, "logps/rejected": -0.17642822861671448, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.9666360020637512, "rewards/margins": -0.007657825946807861, "rewards/rejected": 0.9742938280105591, "step": 6539 }, { "epoch": 3.53, "learning_rate": 3.6151275523328306e-09, "logits/chosen": -2.0456647872924805, "logits/rejected": -2.0548555850982666, "logps/chosen": -3.5469937324523926, "logps/rejected": -6.5085768699646, "loss": 0.3751, "rewards/accuracies": 1.0, "rewards/chosen": 1.0288151502609253, "rewards/margins": 0.7871384620666504, "rewards/rejected": 0.2416766732931137, "step": 6540 }, { "epoch": 3.53, "learning_rate": 3.6069791903157954e-09, "logits/chosen": -2.1968460083007812, "logits/rejected": -2.318645715713501, "logps/chosen": -0.8662055730819702, "logps/rejected": -6.034533977508545, "loss": 0.5976, "rewards/accuracies": 1.0, "rewards/chosen": 1.1034196615219116, "rewards/margins": 0.20110392570495605, "rewards/rejected": 0.9023157358169556, "step": 6541 }, { "epoch": 3.53, "learning_rate": 3.5988396780745834e-09, "logits/chosen": -2.0282368659973145, "logits/rejected": -2.291778326034546, "logps/chosen": -4.517996788024902, "logps/rejected": -2.4708452224731445, "loss": 0.7335, "rewards/accuracies": 0.0, "rewards/chosen": 0.5990273356437683, "rewards/margins": -0.07917016744613647, "rewards/rejected": 0.6781975030899048, "step": 6542 }, { "epoch": 3.53, "learning_rate": 3.5907090171618694e-09, "logits/chosen": -2.0905675888061523, "logits/rejected": -2.2877814769744873, "logps/chosen": -0.5928936004638672, "logps/rejected": -0.6296924352645874, "loss": 0.6595, "rewards/accuracies": 1.0, "rewards/chosen": 0.9818863868713379, "rewards/margins": 0.068378746509552, "rewards/rejected": 0.9135076403617859, "step": 6543 }, { "epoch": 3.53, "learning_rate": 3.5825872091286234e-09, "logits/chosen": -2.0466983318328857, "logits/rejected": -2.3016903400421143, "logps/chosen": -0.37831515073776245, "logps/rejected": -0.39878392219543457, "loss": 0.6622, "rewards/accuracies": 1.0, "rewards/chosen": 0.9989633560180664, "rewards/margins": 0.06283938884735107, "rewards/rejected": 0.9361239671707153, "step": 6544 }, { "epoch": 3.53, "learning_rate": 3.5744742555241347e-09, "logits/chosen": -1.9899593591690063, "logits/rejected": -1.9901518821716309, "logps/chosen": -0.967737078666687, "logps/rejected": -1.1395256519317627, "loss": 0.6424, "rewards/accuracies": 1.0, "rewards/chosen": 0.9395291209220886, "rewards/margins": 0.10422343015670776, "rewards/rejected": 0.8353056907653809, "step": 6545 }, { "epoch": 3.53, "learning_rate": 3.5663701578960093e-09, "logits/chosen": -2.170931100845337, "logits/rejected": -2.3124117851257324, "logps/chosen": -1.2349108457565308, "logps/rejected": -1.132419466972351, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.6664178371429443, "rewards/margins": -0.00504910945892334, "rewards/rejected": 0.6714669466018677, "step": 6546 }, { "epoch": 3.53, "learning_rate": 3.5582749177901493e-09, "logits/chosen": -2.0237972736358643, "logits/rejected": -2.033123016357422, "logps/chosen": -1.212712049484253, "logps/rejected": -3.0400726795196533, "loss": 0.4838, "rewards/accuracies": 1.0, "rewards/chosen": 1.1054338216781616, "rewards/margins": 0.47441840171813965, "rewards/rejected": 0.631015419960022, "step": 6547 }, { "epoch": 3.53, "learning_rate": 3.5501885367507756e-09, "logits/chosen": -2.101066827774048, "logits/rejected": -2.302940607070923, "logps/chosen": -2.1607768535614014, "logps/rejected": -6.648679733276367, "loss": 0.5772, "rewards/accuracies": 1.0, "rewards/chosen": 0.7304245233535767, "rewards/margins": 0.24702006578445435, "rewards/rejected": 0.4834044575691223, "step": 6548 }, { "epoch": 3.53, "learning_rate": 3.5421110163204205e-09, "logits/chosen": -2.0565919876098633, "logits/rejected": -2.0344178676605225, "logps/chosen": -6.625728607177734, "logps/rejected": -6.285220146179199, "loss": 0.3469, "rewards/accuracies": 1.0, "rewards/chosen": 1.3720064163208008, "rewards/margins": 0.8803454637527466, "rewards/rejected": 0.4916609823703766, "step": 6549 }, { "epoch": 3.53, "learning_rate": 3.534042358039929e-09, "logits/chosen": -2.0558419227600098, "logits/rejected": -2.1141955852508545, "logps/chosen": -0.9735075831413269, "logps/rejected": -8.469705581665039, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 1.5598138570785522, "rewards/margins": 0.6775023937225342, "rewards/rejected": 0.8823114633560181, "step": 6550 }, { "epoch": 3.53, "learning_rate": 3.5259825634484486e-09, "logits/chosen": -2.0689961910247803, "logits/rejected": -2.3378405570983887, "logps/chosen": -1.1233361959457397, "logps/rejected": -3.7008917331695557, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 1.0604976415634155, "rewards/margins": 0.013718485832214355, "rewards/rejected": 1.0467791557312012, "step": 6551 }, { "epoch": 3.53, "learning_rate": 3.5179316340834376e-09, "logits/chosen": -2.0478363037109375, "logits/rejected": -2.24249267578125, "logps/chosen": -2.1673731803894043, "logps/rejected": -2.10945725440979, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.8412454724311829, "rewards/margins": 0.010151386260986328, "rewards/rejected": 0.8310940861701965, "step": 6552 }, { "epoch": 3.53, "learning_rate": 3.509889571480662e-09, "logits/chosen": -2.0243241786956787, "logits/rejected": -2.0120463371276855, "logps/chosen": -8.705581665039062, "logps/rejected": -8.158564567565918, "loss": 0.1987, "rewards/accuracies": 1.0, "rewards/chosen": 2.236077070236206, "rewards/margins": 1.5152192115783691, "rewards/rejected": 0.7208579182624817, "step": 6553 }, { "epoch": 3.54, "learning_rate": 3.5018563771742006e-09, "logits/chosen": -2.01655912399292, "logits/rejected": -2.2776083946228027, "logps/chosen": -0.810842752456665, "logps/rejected": -0.708311915397644, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 1.0404839515686035, "rewards/margins": -0.005864500999450684, "rewards/rejected": 1.0463484525680542, "step": 6554 }, { "epoch": 3.54, "learning_rate": 3.4938320526964383e-09, "logits/chosen": -2.323568105697632, "logits/rejected": -2.3656742572784424, "logps/chosen": -0.714962363243103, "logps/rejected": -0.6995288133621216, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.7725331783294678, "rewards/margins": 0.008902490139007568, "rewards/rejected": 0.7636306881904602, "step": 6555 }, { "epoch": 3.54, "learning_rate": 3.4858165995780676e-09, "logits/chosen": -1.986456036567688, "logits/rejected": -1.9902127981185913, "logps/chosen": -0.5516820549964905, "logps/rejected": -4.574207305908203, "loss": 0.5747, "rewards/accuracies": 1.0, "rewards/chosen": 1.2400588989257812, "rewards/margins": 0.2528942823410034, "rewards/rejected": 0.9871646165847778, "step": 6556 }, { "epoch": 3.54, "learning_rate": 3.4778100193480875e-09, "logits/chosen": -2.0339081287384033, "logits/rejected": -2.0244951248168945, "logps/chosen": -5.983473777770996, "logps/rejected": -3.313319206237793, "loss": 0.5748, "rewards/accuracies": 1.0, "rewards/chosen": 1.1689404249191284, "rewards/margins": 0.2527086138725281, "rewards/rejected": 0.9162318110466003, "step": 6557 }, { "epoch": 3.54, "learning_rate": 3.4698123135338043e-09, "logits/chosen": -2.0815532207489014, "logits/rejected": -2.361844539642334, "logps/chosen": -0.2587754726409912, "logps/rejected": -0.2476980984210968, "loss": 0.6709, "rewards/accuracies": 1.0, "rewards/chosen": 0.9298557639122009, "rewards/margins": 0.04500579833984375, "rewards/rejected": 0.8848499655723572, "step": 6558 }, { "epoch": 3.54, "learning_rate": 3.461823483660842e-09, "logits/chosen": -2.062481164932251, "logits/rejected": -2.0698416233062744, "logps/chosen": -0.765622615814209, "logps/rejected": -3.084542989730835, "loss": 0.4913, "rewards/accuracies": 1.0, "rewards/chosen": 1.0948961973190308, "rewards/margins": 0.4550577998161316, "rewards/rejected": 0.6398383975028992, "step": 6559 }, { "epoch": 3.54, "learning_rate": 3.45384353125312e-09, "logits/chosen": -2.1528773307800293, "logits/rejected": -2.293065071105957, "logps/chosen": -0.6197414398193359, "logps/rejected": -1.2791295051574707, "loss": 0.6353, "rewards/accuracies": 1.0, "rewards/chosen": 0.9552165865898132, "rewards/margins": 0.11923366785049438, "rewards/rejected": 0.8359829187393188, "step": 6560 }, { "epoch": 3.54, "learning_rate": 3.445872457832877e-09, "logits/chosen": -2.019460916519165, "logits/rejected": -2.3100218772888184, "logps/chosen": -0.2613629698753357, "logps/rejected": -0.2585635781288147, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9955710768699646, "rewards/margins": 0.019153356552124023, "rewards/rejected": 0.9764177203178406, "step": 6561 }, { "epoch": 3.54, "learning_rate": 3.4379102649206238e-09, "logits/chosen": -2.139230728149414, "logits/rejected": -2.1360034942626953, "logps/chosen": -0.6577126979827881, "logps/rejected": -6.666719913482666, "loss": 0.4876, "rewards/accuracies": 1.0, "rewards/chosen": 1.083916187286377, "rewards/margins": 0.4646748900413513, "rewards/rejected": 0.6192412972450256, "step": 6562 }, { "epoch": 3.54, "learning_rate": 3.4299569540352123e-09, "logits/chosen": -2.1193788051605225, "logits/rejected": -2.3587560653686523, "logps/chosen": -0.837608277797699, "logps/rejected": -0.9145257472991943, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 1.2076338529586792, "rewards/margins": 0.017542600631713867, "rewards/rejected": 1.1900912523269653, "step": 6563 }, { "epoch": 3.54, "learning_rate": 3.422012526693796e-09, "logits/chosen": -2.1309146881103516, "logits/rejected": -2.3671345710754395, "logps/chosen": -0.4125024378299713, "logps/rejected": -20.316673278808594, "loss": 0.6614, "rewards/accuracies": 1.0, "rewards/chosen": 1.9164154529571533, "rewards/margins": 0.06454122066497803, "rewards/rejected": 1.8518742322921753, "step": 6564 }, { "epoch": 3.54, "learning_rate": 3.414076984411818e-09, "logits/chosen": -2.1754839420318604, "logits/rejected": -2.2740516662597656, "logps/chosen": -0.5906766653060913, "logps/rejected": -0.6052639484405518, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 1.0793588161468506, "rewards/margins": 0.008886218070983887, "rewards/rejected": 1.0704725980758667, "step": 6565 }, { "epoch": 3.54, "learning_rate": 3.406150328703039e-09, "logits/chosen": -2.1345534324645996, "logits/rejected": -2.3990628719329834, "logps/chosen": -0.58369380235672, "logps/rejected": -9.934149742126465, "loss": 0.669, "rewards/accuracies": 1.0, "rewards/chosen": 1.0595136880874634, "rewards/margins": 0.0489579439163208, "rewards/rejected": 1.0105557441711426, "step": 6566 }, { "epoch": 3.54, "learning_rate": 3.398232561079523e-09, "logits/chosen": -2.153325319290161, "logits/rejected": -2.1555373668670654, "logps/chosen": -2.567466974258423, "logps/rejected": -3.750281572341919, "loss": 0.5453, "rewards/accuracies": 1.0, "rewards/chosen": 1.0948275327682495, "rewards/margins": 0.3214873671531677, "rewards/rejected": 0.7733401656150818, "step": 6567 }, { "epoch": 3.54, "learning_rate": 3.390323683051638e-09, "logits/chosen": -2.1246230602264404, "logits/rejected": -2.0897715091705322, "logps/chosen": -5.249063491821289, "logps/rejected": -5.190812587738037, "loss": 0.3983, "rewards/accuracies": 1.0, "rewards/chosen": 1.2090857028961182, "rewards/margins": 0.7148187160491943, "rewards/rejected": 0.49426695704460144, "step": 6568 }, { "epoch": 3.54, "learning_rate": 3.3824236961280394e-09, "logits/chosen": -2.0598316192626953, "logits/rejected": -2.059331178665161, "logps/chosen": -2.205213785171509, "logps/rejected": -0.584486186504364, "loss": 0.6327, "rewards/accuracies": 1.0, "rewards/chosen": 0.8639523386955261, "rewards/margins": 0.12473738193511963, "rewards/rejected": 0.7392149567604065, "step": 6569 }, { "epoch": 3.54, "learning_rate": 3.3745326018157218e-09, "logits/chosen": -2.1691527366638184, "logits/rejected": -2.1500766277313232, "logps/chosen": -13.513243675231934, "logps/rejected": -3.9562489986419678, "loss": 0.2882, "rewards/accuracies": 1.0, "rewards/chosen": 1.5168099403381348, "rewards/margins": 1.0964585542678833, "rewards/rejected": 0.42035141587257385, "step": 6570 }, { "epoch": 3.54, "learning_rate": 3.3666504016199582e-09, "logits/chosen": -2.050504446029663, "logits/rejected": -2.2866439819335938, "logps/chosen": -0.36441153287887573, "logps/rejected": -0.34934595227241516, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.875307023525238, "rewards/margins": 0.007265806198120117, "rewards/rejected": 0.8680412173271179, "step": 6571 }, { "epoch": 3.54, "learning_rate": 3.3587770970443296e-09, "logits/chosen": -2.1604316234588623, "logits/rejected": -2.3110454082489014, "logps/chosen": -20.236499786376953, "logps/rejected": -7.789732933044434, "loss": 0.4258, "rewards/accuracies": 1.0, "rewards/chosen": 1.590065360069275, "rewards/margins": 0.6333193182945251, "rewards/rejected": 0.9567460417747498, "step": 6572 }, { "epoch": 3.55, "learning_rate": 3.3509126895907182e-09, "logits/chosen": -2.1495749950408936, "logits/rejected": -2.283344268798828, "logps/chosen": -5.20671272277832, "logps/rejected": -5.290666580200195, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.6556860208511353, "rewards/margins": 0.034616172313690186, "rewards/rejected": 0.6210698485374451, "step": 6573 }, { "epoch": 3.55, "learning_rate": 3.343057180759312e-09, "logits/chosen": -2.0997700691223145, "logits/rejected": -2.164494752883911, "logps/chosen": -2.6095356941223145, "logps/rejected": -13.930155754089355, "loss": 0.3648, "rewards/accuracies": 1.0, "rewards/chosen": 1.667047142982483, "rewards/margins": 0.8203961253166199, "rewards/rejected": 0.846651017665863, "step": 6574 }, { "epoch": 3.55, "learning_rate": 3.3352105720486025e-09, "logits/chosen": -2.070435047149658, "logits/rejected": -2.354077100753784, "logps/chosen": -0.3021925687789917, "logps/rejected": -0.3687743544578552, "loss": 0.7038, "rewards/accuracies": 0.0, "rewards/chosen": 0.7919650077819824, "rewards/margins": -0.021282494068145752, "rewards/rejected": 0.8132475018501282, "step": 6575 }, { "epoch": 3.55, "learning_rate": 3.3273728649553857e-09, "logits/chosen": -2.1737401485443115, "logits/rejected": -2.288656711578369, "logps/chosen": -2.7134523391723633, "logps/rejected": -2.6780619621276855, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 1.1206964254379272, "rewards/margins": 0.015971660614013672, "rewards/rejected": 1.1047247648239136, "step": 6576 }, { "epoch": 3.55, "learning_rate": 3.3195440609747494e-09, "logits/chosen": -2.0082521438598633, "logits/rejected": -2.246281623840332, "logps/chosen": -0.9278354644775391, "logps/rejected": -1.004380464553833, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.9123829007148743, "rewards/margins": 0.0002358555793762207, "rewards/rejected": 0.912147045135498, "step": 6577 }, { "epoch": 3.55, "learning_rate": 3.3117241616001045e-09, "logits/chosen": -2.214508056640625, "logits/rejected": -2.209315299987793, "logps/chosen": -5.657094955444336, "logps/rejected": -6.242758750915527, "loss": 0.3699, "rewards/accuracies": 1.0, "rewards/chosen": 1.2683368921279907, "rewards/margins": 0.8039414882659912, "rewards/rejected": 0.4643954336643219, "step": 6578 }, { "epoch": 3.55, "learning_rate": 3.3039131683231302e-09, "logits/chosen": -2.115762710571289, "logits/rejected": -2.288620948791504, "logps/chosen": -3.2778279781341553, "logps/rejected": -3.135732650756836, "loss": 0.722, "rewards/accuracies": 0.0, "rewards/chosen": 0.7548984885215759, "rewards/margins": -0.05681252479553223, "rewards/rejected": 0.8117110133171082, "step": 6579 }, { "epoch": 3.55, "learning_rate": 3.2961110826338233e-09, "logits/chosen": -1.9593613147735596, "logits/rejected": -1.9517744779586792, "logps/chosen": -3.4339981079101562, "logps/rejected": -4.755008697509766, "loss": 0.2396, "rewards/accuracies": 1.0, "rewards/chosen": 1.638784408569336, "rewards/margins": 1.3067405223846436, "rewards/rejected": 0.33204385638237, "step": 6580 }, { "epoch": 3.55, "learning_rate": 3.288317906020499e-09, "logits/chosen": -2.05155873298645, "logits/rejected": -2.038938283920288, "logps/chosen": -5.213647842407227, "logps/rejected": -1.136056661605835, "loss": 0.3384, "rewards/accuracies": 1.0, "rewards/chosen": 1.6994974613189697, "rewards/margins": 0.909720242023468, "rewards/rejected": 0.7897772192955017, "step": 6581 }, { "epoch": 3.55, "learning_rate": 3.2805336399697514e-09, "logits/chosen": -2.082423686981201, "logits/rejected": -2.0712673664093018, "logps/chosen": -3.493330717086792, "logps/rejected": -3.0986530780792236, "loss": 0.338, "rewards/accuracies": 1.0, "rewards/chosen": 1.6914571523666382, "rewards/margins": 0.9109985828399658, "rewards/rejected": 0.7804585695266724, "step": 6582 }, { "epoch": 3.55, "learning_rate": 3.2727582859664814e-09, "logits/chosen": -2.0492799282073975, "logits/rejected": -2.056942939758301, "logps/chosen": -4.682176113128662, "logps/rejected": -3.789938449859619, "loss": 0.3685, "rewards/accuracies": 1.0, "rewards/chosen": 1.426730990409851, "rewards/margins": 0.8084004521369934, "rewards/rejected": 0.6183305382728577, "step": 6583 }, { "epoch": 3.55, "learning_rate": 3.264991845493886e-09, "logits/chosen": -2.155644655227661, "logits/rejected": -2.140752077102661, "logps/chosen": -6.99061393737793, "logps/rejected": -6.580570220947266, "loss": 0.3132, "rewards/accuracies": 1.0, "rewards/chosen": 1.5415468215942383, "rewards/margins": 1.0004007816314697, "rewards/rejected": 0.5411460995674133, "step": 6584 }, { "epoch": 3.55, "learning_rate": 3.257234320033464e-09, "logits/chosen": -2.0876994132995605, "logits/rejected": -2.0696187019348145, "logps/chosen": -15.28615951538086, "logps/rejected": -4.920389652252197, "loss": 0.3529, "rewards/accuracies": 1.0, "rewards/chosen": 1.7559894323349, "rewards/margins": 0.8598884344100952, "rewards/rejected": 0.8961009979248047, "step": 6585 }, { "epoch": 3.55, "learning_rate": 3.2494857110650197e-09, "logits/chosen": -2.1156225204467773, "logits/rejected": -2.3352725505828857, "logps/chosen": -1.063542127609253, "logps/rejected": -0.9627354145050049, "loss": 0.6715, "rewards/accuracies": 1.0, "rewards/chosen": 1.0328611135482788, "rewards/margins": 0.04387444257736206, "rewards/rejected": 0.9889866709709167, "step": 6586 }, { "epoch": 3.55, "learning_rate": 3.2417460200666546e-09, "logits/chosen": -2.0016884803771973, "logits/rejected": -1.95707368850708, "logps/chosen": -9.403231620788574, "logps/rejected": -1.6289819478988647, "loss": 0.4353, "rewards/accuracies": 1.0, "rewards/chosen": 1.5564948320388794, "rewards/margins": 0.6063045859336853, "rewards/rejected": 0.9501902461051941, "step": 6587 }, { "epoch": 3.55, "learning_rate": 3.2340152485147543e-09, "logits/chosen": -2.055974006652832, "logits/rejected": -2.0580456256866455, "logps/chosen": -2.08376407623291, "logps/rejected": -0.2612948417663574, "loss": 0.6574, "rewards/accuracies": 1.0, "rewards/chosen": 0.8292733430862427, "rewards/margins": 0.0727803111076355, "rewards/rejected": 0.7564930319786072, "step": 6588 }, { "epoch": 3.55, "learning_rate": 3.226293397884028e-09, "logits/chosen": -2.047416925430298, "logits/rejected": -2.324930429458618, "logps/chosen": -0.7679994106292725, "logps/rejected": -0.6815864443778992, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 1.0059150457382202, "rewards/margins": 0.0010097026824951172, "rewards/rejected": 1.004905343055725, "step": 6589 }, { "epoch": 3.55, "learning_rate": 3.218580469647458e-09, "logits/chosen": -2.0719807147979736, "logits/rejected": -2.061213731765747, "logps/chosen": -2.661248207092285, "logps/rejected": -6.842622756958008, "loss": 0.2983, "rewards/accuracies": 1.0, "rewards/chosen": 1.3569639921188354, "rewards/margins": 1.0567063093185425, "rewards/rejected": 0.30025768280029297, "step": 6590 }, { "epoch": 3.56, "learning_rate": 3.2108764652763464e-09, "logits/chosen": -2.1011252403259277, "logits/rejected": -2.3245952129364014, "logps/chosen": -2.9030346870422363, "logps/rejected": -2.877902030944824, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 1.4645702838897705, "rewards/margins": -0.00015306472778320312, "rewards/rejected": 1.4647233486175537, "step": 6591 }, { "epoch": 3.56, "learning_rate": 3.2031813862402725e-09, "logits/chosen": -2.232119083404541, "logits/rejected": -2.2601468563079834, "logps/chosen": -15.47348690032959, "logps/rejected": -12.766237258911133, "loss": 0.4222, "rewards/accuracies": 1.0, "rewards/chosen": 2.0554490089416504, "rewards/margins": 0.6437865495681763, "rewards/rejected": 1.4116624593734741, "step": 6592 }, { "epoch": 3.56, "learning_rate": 3.195495234007134e-09, "logits/chosen": -2.173492431640625, "logits/rejected": -2.324505567550659, "logps/chosen": -13.137701988220215, "logps/rejected": -8.866320610046387, "loss": 0.4901, "rewards/accuracies": 1.0, "rewards/chosen": 1.3167827129364014, "rewards/margins": 0.45820051431655884, "rewards/rejected": 0.8585821986198425, "step": 6593 }, { "epoch": 3.56, "learning_rate": 3.1878180100431084e-09, "logits/chosen": -2.0525007247924805, "logits/rejected": -2.0451133251190186, "logps/chosen": -2.535506010055542, "logps/rejected": -3.7419002056121826, "loss": 0.541, "rewards/accuracies": 1.0, "rewards/chosen": 1.0035640001296997, "rewards/margins": 0.3316773772239685, "rewards/rejected": 0.6718866229057312, "step": 6594 }, { "epoch": 3.56, "learning_rate": 3.18014971581268e-09, "logits/chosen": -2.0590157508850098, "logits/rejected": -2.3043525218963623, "logps/chosen": -0.2228410542011261, "logps/rejected": -0.20328804850578308, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.8332859873771667, "rewards/margins": 0.028773963451385498, "rewards/rejected": 0.8045120239257812, "step": 6595 }, { "epoch": 3.56, "learning_rate": 3.1724903527786227e-09, "logits/chosen": -2.1134378910064697, "logits/rejected": -2.2676141262054443, "logps/chosen": -1.8236790895462036, "logps/rejected": -0.5725370049476624, "loss": 0.6681, "rewards/accuracies": 1.0, "rewards/chosen": 0.9151850938796997, "rewards/margins": 0.050812721252441406, "rewards/rejected": 0.8643723726272583, "step": 6596 }, { "epoch": 3.56, "learning_rate": 3.1648399224020184e-09, "logits/chosen": -2.068526029586792, "logits/rejected": -2.2487192153930664, "logps/chosen": -1.4618844985961914, "logps/rejected": -1.2759137153625488, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.7490631341934204, "rewards/margins": 0.0064664483070373535, "rewards/rejected": 0.7425966858863831, "step": 6597 }, { "epoch": 3.56, "learning_rate": 3.157198426142227e-09, "logits/chosen": -2.1779465675354004, "logits/rejected": -2.072465419769287, "logps/chosen": -25.77726936340332, "logps/rejected": -6.74827241897583, "loss": 0.1168, "rewards/accuracies": 1.0, "rewards/chosen": 2.5811095237731934, "rewards/margins": 2.087871789932251, "rewards/rejected": 0.4932376444339752, "step": 6598 }, { "epoch": 3.56, "learning_rate": 3.1495658654569267e-09, "logits/chosen": -2.1439297199249268, "logits/rejected": -2.1351161003112793, "logps/chosen": -12.654550552368164, "logps/rejected": -13.939618110656738, "loss": 0.3117, "rewards/accuracies": 1.0, "rewards/chosen": 1.487526774406433, "rewards/margins": 1.005964994430542, "rewards/rejected": 0.4815617501735687, "step": 6599 }, { "epoch": 3.56, "learning_rate": 3.14194224180207e-09, "logits/chosen": -2.036571502685547, "logits/rejected": -2.259253740310669, "logps/chosen": -1.6685168743133545, "logps/rejected": -1.5288472175598145, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9712496995925903, "rewards/margins": 0.016872704029083252, "rewards/rejected": 0.9543769955635071, "step": 6600 }, { "epoch": 3.56, "learning_rate": 3.134327556631916e-09, "logits/chosen": -2.1745615005493164, "logits/rejected": -2.330263376235962, "logps/chosen": -1.5517323017120361, "logps/rejected": -1.8733327388763428, "loss": 0.6711, "rewards/accuracies": 1.0, "rewards/chosen": 0.7909890413284302, "rewards/margins": 0.04449462890625, "rewards/rejected": 0.7464944124221802, "step": 6601 }, { "epoch": 3.56, "learning_rate": 3.126721811399019e-09, "logits/chosen": -2.1601450443267822, "logits/rejected": -2.327033519744873, "logps/chosen": -2.5935001373291016, "logps/rejected": -0.48570534586906433, "loss": 0.6489, "rewards/accuracies": 1.0, "rewards/chosen": 1.0652518272399902, "rewards/margins": 0.09045487642288208, "rewards/rejected": 0.9747969508171082, "step": 6602 }, { "epoch": 3.56, "learning_rate": 3.119125007554213e-09, "logits/chosen": -1.9752568006515503, "logits/rejected": -1.9755116701126099, "logps/chosen": -2.7198729515075684, "logps/rejected": -0.4455070495605469, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.793339192867279, "rewards/margins": 0.0011518001556396484, "rewards/rejected": 0.7921873927116394, "step": 6603 }, { "epoch": 3.56, "learning_rate": 3.1115371465466555e-09, "logits/chosen": -1.9946691989898682, "logits/rejected": -1.9929755926132202, "logps/chosen": -2.619274616241455, "logps/rejected": -4.170979976654053, "loss": 0.3555, "rewards/accuracies": 1.0, "rewards/chosen": 1.3989925384521484, "rewards/margins": 0.8511297702789307, "rewards/rejected": 0.5478627681732178, "step": 6604 }, { "epoch": 3.56, "learning_rate": 3.103958229823772e-09, "logits/chosen": -2.057192087173462, "logits/rejected": -2.274872303009033, "logps/chosen": -1.2509263753890991, "logps/rejected": -1.092920184135437, "loss": 0.5806, "rewards/accuracies": 1.0, "rewards/chosen": 1.1444077491760254, "rewards/margins": 0.23939359188079834, "rewards/rejected": 0.905014157295227, "step": 6605 }, { "epoch": 3.56, "learning_rate": 3.096388258831295e-09, "logits/chosen": -2.1766164302825928, "logits/rejected": -2.3262808322906494, "logps/chosen": -0.9022262692451477, "logps/rejected": -0.9308198094367981, "loss": 0.7014, "rewards/accuracies": 0.0, "rewards/chosen": 1.075767159461975, "rewards/margins": -0.01641261577606201, "rewards/rejected": 1.092179775238037, "step": 6606 }, { "epoch": 3.56, "learning_rate": 3.088827235013247e-09, "logits/chosen": -2.085589647293091, "logits/rejected": -2.2295150756835938, "logps/chosen": -0.34024763107299805, "logps/rejected": -0.36252376437187195, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.8663083910942078, "rewards/margins": 0.02101188898086548, "rewards/rejected": 0.8452965021133423, "step": 6607 }, { "epoch": 3.56, "learning_rate": 3.081275159811947e-09, "logits/chosen": -2.08125901222229, "logits/rejected": -2.2823004722595215, "logps/chosen": -0.8449500799179077, "logps/rejected": -0.8047885894775391, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 0.8503408432006836, "rewards/margins": 0.02520751953125, "rewards/rejected": 0.8251333236694336, "step": 6608 }, { "epoch": 3.56, "learning_rate": 3.073732034667992e-09, "logits/chosen": -2.0725982189178467, "logits/rejected": -2.2459585666656494, "logps/chosen": -0.560403048992157, "logps/rejected": -0.6468016505241394, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 1.0620906352996826, "rewards/margins": 0.0067958831787109375, "rewards/rejected": 1.0552947521209717, "step": 6609 }, { "epoch": 3.57, "learning_rate": 3.0661978610202868e-09, "logits/chosen": -2.0444202423095703, "logits/rejected": -2.303065061569214, "logps/chosen": -5.179130554199219, "logps/rejected": -5.135265827178955, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 0.3814467489719391, "rewards/margins": 0.030216842889785767, "rewards/rejected": 0.3512299060821533, "step": 6610 }, { "epoch": 3.57, "learning_rate": 3.0586726403060203e-09, "logits/chosen": -2.170776128768921, "logits/rejected": -2.171128511428833, "logps/chosen": -0.10601936280727386, "logps/rejected": -4.0361151695251465, "loss": 0.4509, "rewards/accuracies": 1.0, "rewards/chosen": 1.0347301959991455, "rewards/margins": 0.562535285949707, "rewards/rejected": 0.4721949100494385, "step": 6611 }, { "epoch": 3.57, "learning_rate": 3.051156373960695e-09, "logits/chosen": -2.149935007095337, "logits/rejected": -2.033590078353882, "logps/chosen": -21.069814682006836, "logps/rejected": -3.129765748977661, "loss": 0.1353, "rewards/accuracies": 1.0, "rewards/chosen": 2.529306173324585, "rewards/margins": 1.931948184967041, "rewards/rejected": 0.5973580479621887, "step": 6612 }, { "epoch": 3.57, "learning_rate": 3.043649063418069e-09, "logits/chosen": -2.146091938018799, "logits/rejected": -2.139282464981079, "logps/chosen": -2.6381969451904297, "logps/rejected": -2.776855230331421, "loss": 0.4153, "rewards/accuracies": 1.0, "rewards/chosen": 1.3295215368270874, "rewards/margins": 0.6638816595077515, "rewards/rejected": 0.6656398773193359, "step": 6613 }, { "epoch": 3.57, "learning_rate": 3.036150710110219e-09, "logits/chosen": -2.0922939777374268, "logits/rejected": -2.077484607696533, "logps/chosen": -10.615299224853516, "logps/rejected": -4.645587921142578, "loss": 0.2626, "rewards/accuracies": 1.0, "rewards/chosen": 1.6661614179611206, "rewards/margins": 1.202779769897461, "rewards/rejected": 0.46338167786598206, "step": 6614 }, { "epoch": 3.57, "learning_rate": 3.028661315467512e-09, "logits/chosen": -2.176318883895874, "logits/rejected": -2.3052546977996826, "logps/chosen": -18.301620483398438, "logps/rejected": -1.694452166557312, "loss": 0.641, "rewards/accuracies": 1.0, "rewards/chosen": 1.14910089969635, "rewards/margins": 0.10722482204437256, "rewards/rejected": 1.0418760776519775, "step": 6615 }, { "epoch": 3.57, "learning_rate": 3.0211808809185947e-09, "logits/chosen": -2.041675567626953, "logits/rejected": -2.034053087234497, "logps/chosen": -12.485063552856445, "logps/rejected": -5.767412185668945, "loss": 0.3113, "rewards/accuracies": 1.0, "rewards/chosen": 1.7627285718917847, "rewards/margins": 1.0074741840362549, "rewards/rejected": 0.7552544474601746, "step": 6616 }, { "epoch": 3.57, "learning_rate": 3.0137094078904078e-09, "logits/chosen": -2.022854804992676, "logits/rejected": -2.0329489707946777, "logps/chosen": -1.411967396736145, "logps/rejected": -1.831769585609436, "loss": 0.4688, "rewards/accuracies": 1.0, "rewards/chosen": 1.1973518133163452, "rewards/margins": 0.5141132473945618, "rewards/rejected": 0.6832385659217834, "step": 6617 }, { "epoch": 3.57, "learning_rate": 3.0062468978081847e-09, "logits/chosen": -2.130382776260376, "logits/rejected": -2.096449375152588, "logps/chosen": -9.99240493774414, "logps/rejected": -2.3621551990509033, "loss": 0.259, "rewards/accuracies": 1.0, "rewards/chosen": 1.8604812622070312, "rewards/margins": 1.2185449600219727, "rewards/rejected": 0.6419363021850586, "step": 6618 }, { "epoch": 3.57, "learning_rate": 2.9987933520954524e-09, "logits/chosen": -2.2091872692108154, "logits/rejected": -2.2166748046875, "logps/chosen": -2.9225120544433594, "logps/rejected": -6.890568256378174, "loss": 0.4064, "rewards/accuracies": 1.0, "rewards/chosen": 0.8297433257102966, "rewards/margins": 0.6902152895927429, "rewards/rejected": 0.1395280361175537, "step": 6619 }, { "epoch": 3.57, "learning_rate": 2.991348772174024e-09, "logits/chosen": -2.1042330265045166, "logits/rejected": -2.105572462081909, "logps/chosen": -2.2165367603302, "logps/rejected": -4.774256229400635, "loss": 0.2812, "rewards/accuracies": 1.0, "rewards/chosen": 1.569961428642273, "rewards/margins": 1.1247161626815796, "rewards/rejected": 0.44524523615837097, "step": 6620 }, { "epoch": 3.57, "learning_rate": 2.9839131594639964e-09, "logits/chosen": -2.0824227333068848, "logits/rejected": -2.0800845623016357, "logps/chosen": -5.987867832183838, "logps/rejected": -5.964656829833984, "loss": 0.6045, "rewards/accuracies": 1.0, "rewards/chosen": 1.2309614419937134, "rewards/margins": 0.18600523471832275, "rewards/rejected": 1.0449562072753906, "step": 6621 }, { "epoch": 3.57, "learning_rate": 2.9764865153837636e-09, "logits/chosen": -2.0332255363464355, "logits/rejected": -2.278651475906372, "logps/chosen": -0.45243945717811584, "logps/rejected": -0.4894290268421173, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9678906798362732, "rewards/margins": 0.02218639850616455, "rewards/rejected": 0.9457042813301086, "step": 6622 }, { "epoch": 3.57, "learning_rate": 2.9690688413500086e-09, "logits/chosen": -2.1658687591552734, "logits/rejected": -2.170370101928711, "logps/chosen": -2.7415382862091064, "logps/rejected": -1.835886836051941, "loss": 0.6414, "rewards/accuracies": 1.0, "rewards/chosen": 0.9763109087944031, "rewards/margins": 0.10640358924865723, "rewards/rejected": 0.8699073195457458, "step": 6623 }, { "epoch": 3.57, "learning_rate": 2.961660138777705e-09, "logits/chosen": -1.958876371383667, "logits/rejected": -2.269869327545166, "logps/chosen": -0.21995703876018524, "logps/rejected": -0.2162645310163498, "loss": 0.6755, "rewards/accuracies": 1.0, "rewards/chosen": 0.8630287051200867, "rewards/margins": 0.03565025329589844, "rewards/rejected": 0.8273784518241882, "step": 6624 }, { "epoch": 3.57, "learning_rate": 2.954260409080106e-09, "logits/chosen": -2.165569305419922, "logits/rejected": -2.155927896499634, "logps/chosen": -3.341625928878784, "logps/rejected": -9.1154146194458, "loss": 0.3489, "rewards/accuracies": 1.0, "rewards/chosen": 1.31283700466156, "rewards/margins": 0.8734803199768066, "rewards/rejected": 0.4393567144870758, "step": 6625 }, { "epoch": 3.57, "learning_rate": 2.9468696536687653e-09, "logits/chosen": -2.1664795875549316, "logits/rejected": -2.2722792625427246, "logps/chosen": -0.7043949365615845, "logps/rejected": -1.9351747035980225, "loss": 0.6616, "rewards/accuracies": 1.0, "rewards/chosen": 0.9096202254295349, "rewards/margins": 0.06421387195587158, "rewards/rejected": 0.8454063534736633, "step": 6626 }, { "epoch": 3.57, "learning_rate": 2.9394878739535114e-09, "logits/chosen": -2.0402917861938477, "logits/rejected": -2.22522234916687, "logps/chosen": -0.2703034579753876, "logps/rejected": -0.26416313648223877, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.8878952860832214, "rewards/margins": 0.005049705505371094, "rewards/rejected": 0.8828455805778503, "step": 6627 }, { "epoch": 3.57, "learning_rate": 2.9321150713424725e-09, "logits/chosen": -2.1691384315490723, "logits/rejected": -2.1714346408843994, "logps/chosen": -2.8053531646728516, "logps/rejected": -6.635346412658691, "loss": 0.3078, "rewards/accuracies": 1.0, "rewards/chosen": 1.2472879886627197, "rewards/margins": 1.0204768180847168, "rewards/rejected": 0.2268112152814865, "step": 6628 }, { "epoch": 3.58, "learning_rate": 2.924751247242063e-09, "logits/chosen": -1.9993592500686646, "logits/rejected": -2.2764763832092285, "logps/chosen": -0.6404716372489929, "logps/rejected": -0.6896597743034363, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 1.0229524374008179, "rewards/margins": 0.01058495044708252, "rewards/rejected": 1.0123674869537354, "step": 6629 }, { "epoch": 3.58, "learning_rate": 2.917396403056971e-09, "logits/chosen": -2.0614428520202637, "logits/rejected": -2.2927751541137695, "logps/chosen": -0.3037659227848053, "logps/rejected": -1.8281798362731934, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 1.0343263149261475, "rewards/margins": 0.19127780199050903, "rewards/rejected": 0.8430485129356384, "step": 6630 }, { "epoch": 3.58, "learning_rate": 2.9100505401901897e-09, "logits/chosen": -2.0356228351593018, "logits/rejected": -2.0779738426208496, "logps/chosen": -0.7434115409851074, "logps/rejected": -14.296760559082031, "loss": 0.6712, "rewards/accuracies": 1.0, "rewards/chosen": 0.8076200485229492, "rewards/margins": 0.04446142911911011, "rewards/rejected": 0.7631586194038391, "step": 6631 }, { "epoch": 3.58, "learning_rate": 2.9027136600429823e-09, "logits/chosen": -2.1840243339538574, "logits/rejected": -2.182516098022461, "logps/chosen": -2.224090814590454, "logps/rejected": -13.624296188354492, "loss": 0.5033, "rewards/accuracies": 1.0, "rewards/chosen": 1.007367491722107, "rewards/margins": 0.42427152395248413, "rewards/rejected": 0.5830959677696228, "step": 6632 }, { "epoch": 3.58, "learning_rate": 2.8953857640149184e-09, "logits/chosen": -2.0689873695373535, "logits/rejected": -2.0548274517059326, "logps/chosen": -17.731557846069336, "logps/rejected": -1.262634038925171, "loss": 0.1963, "rewards/accuracies": 1.0, "rewards/chosen": 2.3572185039520264, "rewards/margins": 1.5286256074905396, "rewards/rejected": 0.8285928964614868, "step": 6633 }, { "epoch": 3.58, "learning_rate": 2.88806685350384e-09, "logits/chosen": -2.1289103031158447, "logits/rejected": -2.292623519897461, "logps/chosen": -3.22462797164917, "logps/rejected": -4.169441223144531, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.7749043107032776, "rewards/margins": 0.015274405479431152, "rewards/rejected": 0.7596299052238464, "step": 6634 }, { "epoch": 3.58, "learning_rate": 2.88075692990587e-09, "logits/chosen": -1.9707813262939453, "logits/rejected": -1.9585233926773071, "logps/chosen": -0.5480630993843079, "logps/rejected": -4.648586273193359, "loss": 0.4691, "rewards/accuracies": 1.0, "rewards/chosen": 1.1674085855484009, "rewards/margins": 0.5132813453674316, "rewards/rejected": 0.6541272401809692, "step": 6635 }, { "epoch": 3.58, "learning_rate": 2.873455994615437e-09, "logits/chosen": -1.9927846193313599, "logits/rejected": -2.240598440170288, "logps/chosen": -0.15024960041046143, "logps/rejected": -0.16761647164821625, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.846265971660614, "rewards/margins": 0.023774802684783936, "rewards/rejected": 0.8224911689758301, "step": 6636 }, { "epoch": 3.58, "learning_rate": 2.8661640490252272e-09, "logits/chosen": -2.0700764656066895, "logits/rejected": -2.0750370025634766, "logps/chosen": -0.536107063293457, "logps/rejected": -2.7015559673309326, "loss": 0.4735, "rewards/accuracies": 1.0, "rewards/chosen": 1.0822335481643677, "rewards/margins": 0.5014271140098572, "rewards/rejected": 0.5808064341545105, "step": 6637 }, { "epoch": 3.58, "learning_rate": 2.8588810945262443e-09, "logits/chosen": -2.1569483280181885, "logits/rejected": -2.25707745552063, "logps/chosen": -0.28017938137054443, "logps/rejected": -0.27832746505737305, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.8031376004219055, "rewards/margins": 0.013794362545013428, "rewards/rejected": 0.7893432378768921, "step": 6638 }, { "epoch": 3.58, "learning_rate": 2.851607132507744e-09, "logits/chosen": -1.9834080934524536, "logits/rejected": -2.2583537101745605, "logps/chosen": -0.22535961866378784, "logps/rejected": -0.15548895299434662, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9101857542991638, "rewards/margins": 0.04992115497589111, "rewards/rejected": 0.8602645993232727, "step": 6639 }, { "epoch": 3.58, "learning_rate": 2.8443421643572994e-09, "logits/chosen": -2.029801607131958, "logits/rejected": -2.25148344039917, "logps/chosen": -0.1969950944185257, "logps/rejected": -0.2242773473262787, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.8365212678909302, "rewards/margins": 0.02345287799835205, "rewards/rejected": 0.8130683898925781, "step": 6640 }, { "epoch": 3.58, "learning_rate": 2.837086191460736e-09, "logits/chosen": -2.1087703704833984, "logits/rejected": -2.304727792739868, "logps/chosen": -0.14581695199012756, "logps/rejected": -0.16295476257801056, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.819586455821991, "rewards/margins": 0.006067991256713867, "rewards/rejected": 0.8135184645652771, "step": 6641 }, { "epoch": 3.58, "learning_rate": 2.829839215202184e-09, "logits/chosen": -2.0502431392669678, "logits/rejected": -2.302777051925659, "logps/chosen": -2.3190841674804688, "logps/rejected": -0.48166894912719727, "loss": 0.7266, "rewards/accuracies": 0.0, "rewards/chosen": 1.0098602771759033, "rewards/margins": -0.06582081317901611, "rewards/rejected": 1.0756810903549194, "step": 6642 }, { "epoch": 3.58, "learning_rate": 2.822601236964056e-09, "logits/chosen": -2.1791675090789795, "logits/rejected": -2.3633642196655273, "logps/chosen": -0.3050440847873688, "logps/rejected": -0.30558374524116516, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.9876322150230408, "rewards/margins": 0.02590543031692505, "rewards/rejected": 0.9617267847061157, "step": 6643 }, { "epoch": 3.58, "learning_rate": 2.815372258127041e-09, "logits/chosen": -2.251164674758911, "logits/rejected": -2.1133956909179688, "logps/chosen": -29.73889923095703, "logps/rejected": -1.5340580940246582, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": 3.2718169689178467, "rewards/margins": 2.5755980014801025, "rewards/rejected": 0.6962190270423889, "step": 6644 }, { "epoch": 3.58, "learning_rate": 2.8081522800701196e-09, "logits/chosen": -2.0569064617156982, "logits/rejected": -2.3506886959075928, "logps/chosen": -2.2057783603668213, "logps/rejected": -2.079054355621338, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.6856779456138611, "rewards/margins": 0.005598604679107666, "rewards/rejected": 0.6800793409347534, "step": 6645 }, { "epoch": 3.58, "learning_rate": 2.80094130417054e-09, "logits/chosen": -2.1760342121124268, "logits/rejected": -2.176244020462036, "logps/chosen": -2.6877286434173584, "logps/rejected": -4.4906158447265625, "loss": 0.2508, "rewards/accuracies": 1.0, "rewards/chosen": 1.7205528020858765, "rewards/margins": 1.254917860031128, "rewards/rejected": 0.46563491225242615, "step": 6646 }, { "epoch": 3.59, "learning_rate": 2.7937393318038514e-09, "logits/chosen": -2.1617443561553955, "logits/rejected": -2.163430690765381, "logps/chosen": -1.3758466243743896, "logps/rejected": -3.888103485107422, "loss": 0.4901, "rewards/accuracies": 1.0, "rewards/chosen": 1.0236014127731323, "rewards/margins": 0.4582290053367615, "rewards/rejected": 0.5653724074363708, "step": 6647 }, { "epoch": 3.59, "learning_rate": 2.786546364343867e-09, "logits/chosen": -2.190351963043213, "logits/rejected": -2.1967618465423584, "logps/chosen": -2.178262710571289, "logps/rejected": -6.692702293395996, "loss": 0.2759, "rewards/accuracies": 1.0, "rewards/chosen": 1.6390100717544556, "rewards/margins": 1.1464126110076904, "rewards/rejected": 0.49259740114212036, "step": 6648 }, { "epoch": 3.59, "learning_rate": 2.7793624031627103e-09, "logits/chosen": -2.139373540878296, "logits/rejected": -2.1470301151275635, "logps/chosen": -2.8047306537628174, "logps/rejected": -4.93437385559082, "loss": 0.3524, "rewards/accuracies": 1.0, "rewards/chosen": 1.2612193822860718, "rewards/margins": 0.8615366220474243, "rewards/rejected": 0.3996827304363251, "step": 6649 }, { "epoch": 3.59, "learning_rate": 2.772187449630764e-09, "logits/chosen": -2.1304383277893066, "logits/rejected": -2.13181471824646, "logps/chosen": -1.0769424438476562, "logps/rejected": -2.3087351322174072, "loss": 0.492, "rewards/accuracies": 1.0, "rewards/chosen": 1.1049050092697144, "rewards/margins": 0.45326489210128784, "rewards/rejected": 0.6516401171684265, "step": 6650 }, { "epoch": 3.59, "learning_rate": 2.7650215051166937e-09, "logits/chosen": -2.122133493423462, "logits/rejected": -2.096588611602783, "logps/chosen": -12.206839561462402, "logps/rejected": -2.1041626930236816, "loss": 0.3724, "rewards/accuracies": 1.0, "rewards/chosen": 1.4952632188796997, "rewards/margins": 0.7957485914230347, "rewards/rejected": 0.699514627456665, "step": 6651 }, { "epoch": 3.59, "learning_rate": 2.757864570987445e-09, "logits/chosen": -2.224261522293091, "logits/rejected": -2.2405848503112793, "logps/chosen": -2.5942609310150146, "logps/rejected": -10.515534400939941, "loss": 0.4079, "rewards/accuracies": 1.0, "rewards/chosen": 1.4626251459121704, "rewards/margins": 0.6859065294265747, "rewards/rejected": 0.7767186164855957, "step": 6652 }, { "epoch": 3.59, "learning_rate": 2.7507166486082644e-09, "logits/chosen": -2.1033124923706055, "logits/rejected": -2.103468894958496, "logps/chosen": -1.6532796621322632, "logps/rejected": -0.49866560101509094, "loss": 0.3819, "rewards/accuracies": 1.0, "rewards/chosen": 1.5342170000076294, "rewards/margins": 0.7654509544372559, "rewards/rejected": 0.7687660455703735, "step": 6653 }, { "epoch": 3.59, "learning_rate": 2.743577739342662e-09, "logits/chosen": -2.0522618293762207, "logits/rejected": -2.038022518157959, "logps/chosen": -1.798891305923462, "logps/rejected": -4.338016986846924, "loss": 0.3664, "rewards/accuracies": 1.0, "rewards/chosen": 1.3176628351211548, "rewards/margins": 0.815371572971344, "rewards/rejected": 0.5022912621498108, "step": 6654 }, { "epoch": 3.59, "learning_rate": 2.736447844552425e-09, "logits/chosen": -1.9748358726501465, "logits/rejected": -2.2719171047210693, "logps/chosen": -0.8843767046928406, "logps/rejected": -0.8600506782531738, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.9610967040061951, "rewards/margins": -0.006145000457763672, "rewards/rejected": 0.9672417044639587, "step": 6655 }, { "epoch": 3.59, "learning_rate": 2.7293269655976282e-09, "logits/chosen": -2.117851972579956, "logits/rejected": -2.3787922859191895, "logps/chosen": -14.747772216796875, "logps/rejected": -12.281776428222656, "loss": 0.776, "rewards/accuracies": 0.0, "rewards/chosen": 1.2834968566894531, "rewards/margins": -0.15939044952392578, "rewards/rejected": 1.442887306213379, "step": 6656 }, { "epoch": 3.59, "learning_rate": 2.7222151038366335e-09, "logits/chosen": -2.1909282207489014, "logits/rejected": -2.0826900005340576, "logps/chosen": -22.05498504638672, "logps/rejected": -4.766358375549316, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 2.590672254562378, "rewards/margins": 2.1161811351776123, "rewards/rejected": 0.4744912087917328, "step": 6657 }, { "epoch": 3.59, "learning_rate": 2.715112260626068e-09, "logits/chosen": -2.1477835178375244, "logits/rejected": -2.1403117179870605, "logps/chosen": -5.304534435272217, "logps/rejected": -4.513392448425293, "loss": 0.383, "rewards/accuracies": 1.0, "rewards/chosen": 1.2851065397262573, "rewards/margins": 0.7620568871498108, "rewards/rejected": 0.5230496525764465, "step": 6658 }, { "epoch": 3.59, "learning_rate": 2.7080184373208414e-09, "logits/chosen": -1.941383957862854, "logits/rejected": -1.943997859954834, "logps/chosen": -1.6084883213043213, "logps/rejected": -3.182321548461914, "loss": 0.5103, "rewards/accuracies": 1.0, "rewards/chosen": 1.0383944511413574, "rewards/margins": 0.40674275159835815, "rewards/rejected": 0.6316516995429993, "step": 6659 }, { "epoch": 3.59, "learning_rate": 2.7009336352741664e-09, "logits/chosen": -2.1185896396636963, "logits/rejected": -2.290649890899658, "logps/chosen": -2.071758508682251, "logps/rejected": -2.8607773780822754, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 0.9641649127006531, "rewards/margins": 0.14812332391738892, "rewards/rejected": 0.8160415887832642, "step": 6660 }, { "epoch": 3.59, "learning_rate": 2.6938578558375002e-09, "logits/chosen": -2.020331621170044, "logits/rejected": -2.0137839317321777, "logps/chosen": -3.1542396545410156, "logps/rejected": -3.7618567943573, "loss": 0.3361, "rewards/accuracies": 1.0, "rewards/chosen": 1.497064232826233, "rewards/margins": 0.9177269339561462, "rewards/rejected": 0.5793372988700867, "step": 6661 }, { "epoch": 3.59, "learning_rate": 2.686791100360608e-09, "logits/chosen": -2.056197166442871, "logits/rejected": -2.0499329566955566, "logps/chosen": -1.5288323163986206, "logps/rejected": -6.5571699142456055, "loss": 0.4865, "rewards/accuracies": 1.0, "rewards/chosen": 1.1547948122024536, "rewards/margins": 0.4675178527832031, "rewards/rejected": 0.6872769594192505, "step": 6662 }, { "epoch": 3.59, "learning_rate": 2.679733370191506e-09, "logits/chosen": -2.0401673316955566, "logits/rejected": -2.0421760082244873, "logps/chosen": -0.204893559217453, "logps/rejected": -3.7795822620391846, "loss": 0.5012, "rewards/accuracies": 1.0, "rewards/chosen": 1.0445890426635742, "rewards/margins": 0.42974793910980225, "rewards/rejected": 0.614841103553772, "step": 6663 }, { "epoch": 3.59, "learning_rate": 2.6726846666765114e-09, "logits/chosen": -2.03507137298584, "logits/rejected": -2.02402663230896, "logps/chosen": -11.538185119628906, "logps/rejected": -2.6588456630706787, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 1.0395711660385132, "rewards/margins": 0.19131159782409668, "rewards/rejected": 0.8482595682144165, "step": 6664 }, { "epoch": 3.59, "learning_rate": 2.665644991160204e-09, "logits/chosen": -2.1724953651428223, "logits/rejected": -2.3613014221191406, "logps/chosen": -1.9169361591339111, "logps/rejected": -1.816426396369934, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.6215841174125671, "rewards/margins": 0.0018662214279174805, "rewards/rejected": 0.6197178959846497, "step": 6665 }, { "epoch": 3.6, "learning_rate": 2.658614344985455e-09, "logits/chosen": -2.265610456466675, "logits/rejected": -2.3921597003936768, "logps/chosen": -2.138822078704834, "logps/rejected": -2.2511045932769775, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.7852024435997009, "rewards/margins": 0.01830524206161499, "rewards/rejected": 0.7668972015380859, "step": 6666 }, { "epoch": 3.6, "learning_rate": 2.651592729493407e-09, "logits/chosen": -2.1197543144226074, "logits/rejected": -2.064149856567383, "logps/chosen": -24.828954696655273, "logps/rejected": -4.617809295654297, "loss": 0.1897, "rewards/accuracies": 1.0, "rewards/chosen": 2.0918397903442383, "rewards/margins": 1.5661346912384033, "rewards/rejected": 0.5257050395011902, "step": 6667 }, { "epoch": 3.6, "learning_rate": 2.6445801460234785e-09, "logits/chosen": -2.10310697555542, "logits/rejected": -2.270771026611328, "logps/chosen": -0.6002472639083862, "logps/rejected": -0.6532426476478577, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.6847640872001648, "rewards/margins": 0.011888623237609863, "rewards/rejected": 0.6728754639625549, "step": 6668 }, { "epoch": 3.6, "learning_rate": 2.63757659591336e-09, "logits/chosen": -2.1431503295898438, "logits/rejected": -2.1173150539398193, "logps/chosen": -12.473657608032227, "logps/rejected": -1.0253938436508179, "loss": 0.3265, "rewards/accuracies": 1.0, "rewards/chosen": 1.945621132850647, "rewards/margins": 0.9515215158462524, "rewards/rejected": 0.9940996170043945, "step": 6669 }, { "epoch": 3.6, "learning_rate": 2.630582080499033e-09, "logits/chosen": -2.1236374378204346, "logits/rejected": -2.2393856048583984, "logps/chosen": -16.351924896240234, "logps/rejected": -20.777198791503906, "loss": 0.2757, "rewards/accuracies": 1.0, "rewards/chosen": 1.9094513654708862, "rewards/margins": 1.1475844383239746, "rewards/rejected": 0.7618669867515564, "step": 6670 }, { "epoch": 3.6, "learning_rate": 2.623596601114747e-09, "logits/chosen": -2.2291996479034424, "logits/rejected": -2.093493938446045, "logps/chosen": -29.967737197875977, "logps/rejected": -3.709022045135498, "loss": 0.1847, "rewards/accuracies": 1.0, "rewards/chosen": 2.280480146408081, "rewards/margins": 1.5951007604599, "rewards/rejected": 0.6853793859481812, "step": 6671 }, { "epoch": 3.6, "learning_rate": 2.616620159093025e-09, "logits/chosen": -2.17110276222229, "logits/rejected": -2.225034475326538, "logps/chosen": -5.0230255126953125, "logps/rejected": -7.968853950500488, "loss": 0.5098, "rewards/accuracies": 1.0, "rewards/chosen": 1.325535774230957, "rewards/margins": 0.4079367518424988, "rewards/rejected": 0.9175990223884583, "step": 6672 }, { "epoch": 3.6, "learning_rate": 2.6096527557646685e-09, "logits/chosen": -2.089360475540161, "logits/rejected": -2.315953254699707, "logps/chosen": -0.3042823076248169, "logps/rejected": -0.30776387453079224, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8714489340782166, "rewards/margins": 0.010905861854553223, "rewards/rejected": 0.8605430722236633, "step": 6673 }, { "epoch": 3.6, "learning_rate": 2.602694392458754e-09, "logits/chosen": -2.052814245223999, "logits/rejected": -2.0537571907043457, "logps/chosen": -0.26748526096343994, "logps/rejected": -4.779843807220459, "loss": 0.4568, "rewards/accuracies": 1.0, "rewards/chosen": 1.0754315853118896, "rewards/margins": 0.5463832020759583, "rewards/rejected": 0.5290483832359314, "step": 6674 }, { "epoch": 3.6, "learning_rate": 2.5957450705026475e-09, "logits/chosen": -2.0358386039733887, "logits/rejected": -2.3064286708831787, "logps/chosen": -0.09272359311580658, "logps/rejected": -0.11206235736608505, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8730232119560242, "rewards/margins": 0.021253585815429688, "rewards/rejected": 0.8517696261405945, "step": 6675 }, { "epoch": 3.6, "learning_rate": 2.588804791221966e-09, "logits/chosen": -2.085674285888672, "logits/rejected": -2.270231008529663, "logps/chosen": -0.5216532945632935, "logps/rejected": -0.5732854008674622, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.9181498885154724, "rewards/margins": 0.015021264553070068, "rewards/rejected": 0.9031286239624023, "step": 6676 }, { "epoch": 3.6, "learning_rate": 2.581873555940617e-09, "logits/chosen": -2.162341356277466, "logits/rejected": -2.2846028804779053, "logps/chosen": -3.815035581588745, "logps/rejected": -1.9379920959472656, "loss": 0.6069, "rewards/accuracies": 1.0, "rewards/chosen": 1.0908093452453613, "rewards/margins": 0.18063771724700928, "rewards/rejected": 0.910171627998352, "step": 6677 }, { "epoch": 3.6, "learning_rate": 2.574951365980782e-09, "logits/chosen": -2.1941819190979004, "logits/rejected": -2.3157103061676025, "logps/chosen": -3.609443187713623, "logps/rejected": -10.75563907623291, "loss": 0.5088, "rewards/accuracies": 1.0, "rewards/chosen": 1.0194990634918213, "rewards/margins": 0.4104693531990051, "rewards/rejected": 0.6090297102928162, "step": 6678 }, { "epoch": 3.6, "learning_rate": 2.5680382226629104e-09, "logits/chosen": -2.056025743484497, "logits/rejected": -2.327620506286621, "logps/chosen": -0.7563071250915527, "logps/rejected": -0.9934837222099304, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.8542869687080383, "rewards/margins": 0.0354771614074707, "rewards/rejected": 0.8188098073005676, "step": 6679 }, { "epoch": 3.6, "learning_rate": 2.5611341273057365e-09, "logits/chosen": -2.2099151611328125, "logits/rejected": -2.219398260116577, "logps/chosen": -2.4863169193267822, "logps/rejected": -5.118016719818115, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 1.3632129430770874, "rewards/margins": 0.9479303359985352, "rewards/rejected": 0.41528257727622986, "step": 6680 }, { "epoch": 3.6, "learning_rate": 2.554239081226245e-09, "logits/chosen": -2.1536076068878174, "logits/rejected": -1.954190969467163, "logps/chosen": -34.22785949707031, "logps/rejected": -3.9741063117980957, "loss": 0.1275, "rewards/accuracies": 1.0, "rewards/chosen": 2.5108933448791504, "rewards/margins": 1.995171070098877, "rewards/rejected": 0.5157222151756287, "step": 6681 }, { "epoch": 3.6, "learning_rate": 2.5473530857397396e-09, "logits/chosen": -2.127516746520996, "logits/rejected": -2.1261043548583984, "logps/chosen": -1.7920092344284058, "logps/rejected": -4.501194000244141, "loss": 0.2725, "rewards/accuracies": 1.0, "rewards/chosen": 1.7129243612289429, "rewards/margins": 1.160916805267334, "rewards/rejected": 0.5520074963569641, "step": 6682 }, { "epoch": 3.6, "learning_rate": 2.5404761421597476e-09, "logits/chosen": -2.214292049407959, "logits/rejected": -2.2230184078216553, "logps/chosen": -1.4736754894256592, "logps/rejected": -6.127973556518555, "loss": 0.2855, "rewards/accuracies": 1.0, "rewards/chosen": 1.3720016479492188, "rewards/margins": 1.1073187589645386, "rewards/rejected": 0.2646828591823578, "step": 6683 }, { "epoch": 3.61, "learning_rate": 2.5336082517981084e-09, "logits/chosen": -1.9685267210006714, "logits/rejected": -1.9818832874298096, "logps/chosen": -5.6908979415893555, "logps/rejected": -4.515145778656006, "loss": 0.3528, "rewards/accuracies": 1.0, "rewards/chosen": 1.5349440574645996, "rewards/margins": 0.8603777289390564, "rewards/rejected": 0.6745663285255432, "step": 6684 }, { "epoch": 3.61, "learning_rate": 2.526749415964902e-09, "logits/chosen": -2.0675435066223145, "logits/rejected": -2.28485107421875, "logps/chosen": -0.32072946429252625, "logps/rejected": -0.32879069447517395, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.842245876789093, "rewards/margins": 0.007885277271270752, "rewards/rejected": 0.8343605995178223, "step": 6685 }, { "epoch": 3.61, "learning_rate": 2.5198996359684975e-09, "logits/chosen": -2.041280508041382, "logits/rejected": -2.309345006942749, "logps/chosen": -1.165766954421997, "logps/rejected": -4.247185230255127, "loss": 0.5762, "rewards/accuracies": 1.0, "rewards/chosen": 0.9567513465881348, "rewards/margins": 0.24937862157821655, "rewards/rejected": 0.7073727250099182, "step": 6686 }, { "epoch": 3.61, "learning_rate": 2.5130589131155455e-09, "logits/chosen": -2.03083872795105, "logits/rejected": -2.0327799320220947, "logps/chosen": -0.833404541015625, "logps/rejected": -7.67604923248291, "loss": 0.4179, "rewards/accuracies": 1.0, "rewards/chosen": 0.9626978039741516, "rewards/margins": 0.6561627388000488, "rewards/rejected": 0.3065350651741028, "step": 6687 }, { "epoch": 3.61, "learning_rate": 2.5062272487109514e-09, "logits/chosen": -2.1177074909210205, "logits/rejected": -2.2710721492767334, "logps/chosen": -1.8779137134552002, "logps/rejected": -0.5784434080123901, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 1.0544756650924683, "rewards/margins": 0.03736591339111328, "rewards/rejected": 1.017109751701355, "step": 6688 }, { "epoch": 3.61, "learning_rate": 2.4994046440579064e-09, "logits/chosen": -2.078303575515747, "logits/rejected": -2.0262691974639893, "logps/chosen": -10.681418418884277, "logps/rejected": -2.9966800212860107, "loss": 0.2438, "rewards/accuracies": 1.0, "rewards/chosen": 2.080491304397583, "rewards/margins": 1.2870169878005981, "rewards/rejected": 0.7934743165969849, "step": 6689 }, { "epoch": 3.61, "learning_rate": 2.492591100457864e-09, "logits/chosen": -2.065715789794922, "logits/rejected": -2.3446998596191406, "logps/chosen": -1.0066322088241577, "logps/rejected": -1.0938669443130493, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.8065134882926941, "rewards/margins": 0.02549499273300171, "rewards/rejected": 0.7810184955596924, "step": 6690 }, { "epoch": 3.61, "learning_rate": 2.48578661921055e-09, "logits/chosen": -2.1050078868865967, "logits/rejected": -2.1115331649780273, "logps/chosen": -8.107694625854492, "logps/rejected": -2.8260819911956787, "loss": 0.3158, "rewards/accuracies": 1.0, "rewards/chosen": 1.5883344411849976, "rewards/margins": 0.9905602931976318, "rewards/rejected": 0.5977741479873657, "step": 6691 }, { "epoch": 3.61, "learning_rate": 2.4789912016139724e-09, "logits/chosen": -1.9695864915847778, "logits/rejected": -2.259694814682007, "logps/chosen": -1.7072901725769043, "logps/rejected": -1.7948826551437378, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.8004676699638367, "rewards/margins": 0.024561166763305664, "rewards/rejected": 0.775906503200531, "step": 6692 }, { "epoch": 3.61, "learning_rate": 2.4722048489643878e-09, "logits/chosen": -2.015659809112549, "logits/rejected": -2.3160407543182373, "logps/chosen": -0.29268398880958557, "logps/rejected": -0.3085164725780487, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.9891424179077148, "rewards/margins": 0.01541811227798462, "rewards/rejected": 0.9737243056297302, "step": 6693 }, { "epoch": 3.61, "learning_rate": 2.4654275625563493e-09, "logits/chosen": -2.195664644241333, "logits/rejected": -2.3574321269989014, "logps/chosen": -1.3683063983917236, "logps/rejected": -1.5416078567504883, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.9816312789916992, "rewards/margins": 0.027633070945739746, "rewards/rejected": 0.9539982080459595, "step": 6694 }, { "epoch": 3.61, "learning_rate": 2.458659343682673e-09, "logits/chosen": -2.1562182903289795, "logits/rejected": -2.1564135551452637, "logps/chosen": -0.736717939376831, "logps/rejected": -2.930366039276123, "loss": 0.5025, "rewards/accuracies": 1.0, "rewards/chosen": 1.0476405620574951, "rewards/margins": 0.4262973666191101, "rewards/rejected": 0.621343195438385, "step": 6695 }, { "epoch": 3.61, "learning_rate": 2.451900193634432e-09, "logits/chosen": -2.1700971126556396, "logits/rejected": -2.360062837600708, "logps/chosen": -5.712467670440674, "logps/rejected": -5.401116371154785, "loss": 0.7005, "rewards/accuracies": 0.0, "rewards/chosen": 0.9039308428764343, "rewards/margins": -0.014612674713134766, "rewards/rejected": 0.9185435175895691, "step": 6696 }, { "epoch": 3.61, "learning_rate": 2.4451501137009833e-09, "logits/chosen": -2.197010040283203, "logits/rejected": -2.3504140377044678, "logps/chosen": -9.215346336364746, "logps/rejected": -12.234288215637207, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 1.219505786895752, "rewards/margins": 0.0386502742767334, "rewards/rejected": 1.1808555126190186, "step": 6697 }, { "epoch": 3.61, "learning_rate": 2.438409105169953e-09, "logits/chosen": -2.151771306991577, "logits/rejected": -2.158132553100586, "logps/chosen": -0.5667359232902527, "logps/rejected": -2.159672498703003, "loss": 0.4933, "rewards/accuracies": 1.0, "rewards/chosen": 0.9867746233940125, "rewards/margins": 0.4497981071472168, "rewards/rejected": 0.5369765162467957, "step": 6698 }, { "epoch": 3.61, "learning_rate": 2.431677169327223e-09, "logits/chosen": -2.013972043991089, "logits/rejected": -2.316192150115967, "logps/chosen": -0.4754980504512787, "logps/rejected": -0.5776585936546326, "loss": 0.7086, "rewards/accuracies": 0.0, "rewards/chosen": 0.907811164855957, "rewards/margins": -0.030674397945404053, "rewards/rejected": 0.9384855628013611, "step": 6699 }, { "epoch": 3.61, "learning_rate": 2.4249543074569614e-09, "logits/chosen": -2.1306700706481934, "logits/rejected": -2.129149913787842, "logps/chosen": -0.4557907283306122, "logps/rejected": -2.641127586364746, "loss": 0.5917, "rewards/accuracies": 1.0, "rewards/chosen": 0.9992494583129883, "rewards/margins": 0.21444660425186157, "rewards/rejected": 0.7848028540611267, "step": 6700 }, { "epoch": 3.61, "learning_rate": 2.418240520841608e-09, "logits/chosen": -2.0829524993896484, "logits/rejected": -2.280935525894165, "logps/chosen": -1.7463700771331787, "logps/rejected": -1.6385340690612793, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.9298454523086548, "rewards/margins": 0.008032023906707764, "rewards/rejected": 0.921813428401947, "step": 6701 }, { "epoch": 3.61, "learning_rate": 2.411535810761839e-09, "logits/chosen": -2.0130507946014404, "logits/rejected": -2.0114541053771973, "logps/chosen": -0.2525579035282135, "logps/rejected": -6.007318496704102, "loss": 0.398, "rewards/accuracies": 1.0, "rewards/chosen": 1.0549432039260864, "rewards/margins": 0.7156493663787842, "rewards/rejected": 0.33929386734962463, "step": 6702 }, { "epoch": 3.62, "learning_rate": 2.4048401784966366e-09, "logits/chosen": -2.250027656555176, "logits/rejected": -2.1371355056762695, "logps/chosen": -24.943166732788086, "logps/rejected": -5.281135082244873, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": 2.6737112998962402, "rewards/margins": 2.2662549018859863, "rewards/rejected": 0.4074564576148987, "step": 6703 }, { "epoch": 3.62, "learning_rate": 2.3981536253232295e-09, "logits/chosen": -2.3770298957824707, "logits/rejected": -2.3188998699188232, "logps/chosen": -18.59996223449707, "logps/rejected": -6.574460506439209, "loss": 0.1014, "rewards/accuracies": 1.0, "rewards/chosen": 2.5578954219818115, "rewards/margins": 2.237311601638794, "rewards/rejected": 0.32058387994766235, "step": 6704 }, { "epoch": 3.62, "learning_rate": 2.3914761525171355e-09, "logits/chosen": -2.084137439727783, "logits/rejected": -1.9739904403686523, "logps/chosen": -26.739253997802734, "logps/rejected": -1.8289337158203125, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 2.658066511154175, "rewards/margins": 1.8414255380630493, "rewards/rejected": 0.8166409730911255, "step": 6705 }, { "epoch": 3.62, "learning_rate": 2.3848077613521145e-09, "logits/chosen": -2.268289804458618, "logits/rejected": -2.116004228591919, "logps/chosen": -48.77162170410156, "logps/rejected": -20.943897247314453, "loss": 0.133, "rewards/accuracies": 1.0, "rewards/chosen": 3.067186117172241, "rewards/margins": 1.9498364925384521, "rewards/rejected": 1.117349624633789, "step": 6706 }, { "epoch": 3.62, "learning_rate": 2.378148453100215e-09, "logits/chosen": -2.1308040618896484, "logits/rejected": -2.280681610107422, "logps/chosen": -1.349792718887329, "logps/rejected": -4.772739887237549, "loss": 0.6272, "rewards/accuracies": 1.0, "rewards/chosen": 0.9284172058105469, "rewards/margins": 0.13661175966262817, "rewards/rejected": 0.7918054461479187, "step": 6707 }, { "epoch": 3.62, "learning_rate": 2.3714982290317376e-09, "logits/chosen": -2.305612087249756, "logits/rejected": -2.144862413406372, "logps/chosen": -24.267080307006836, "logps/rejected": -4.300485134124756, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": 2.572967052459717, "rewards/margins": 2.091878652572632, "rewards/rejected": 0.4810883104801178, "step": 6708 }, { "epoch": 3.62, "learning_rate": 2.364857090415262e-09, "logits/chosen": -2.058468818664551, "logits/rejected": -2.0509033203125, "logps/chosen": -4.311278820037842, "logps/rejected": -4.50853967666626, "loss": 0.2598, "rewards/accuracies": 1.0, "rewards/chosen": 1.6344051361083984, "rewards/margins": 1.2152220010757446, "rewards/rejected": 0.4191831052303314, "step": 6709 }, { "epoch": 3.62, "learning_rate": 2.3582250385176194e-09, "logits/chosen": -2.1495094299316406, "logits/rejected": -2.337603807449341, "logps/chosen": -1.3261843919754028, "logps/rejected": -0.9241990447044373, "loss": 0.7194, "rewards/accuracies": 0.0, "rewards/chosen": 0.9979033470153809, "rewards/margins": -0.0517730712890625, "rewards/rejected": 1.0496764183044434, "step": 6710 }, { "epoch": 3.62, "learning_rate": 2.3516020746039255e-09, "logits/chosen": -2.234886407852173, "logits/rejected": -2.0516326427459717, "logps/chosen": -31.575153350830078, "logps/rejected": -3.648038148880005, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": 3.2127397060394287, "rewards/margins": 2.737076997756958, "rewards/rejected": 0.47566261887550354, "step": 6711 }, { "epoch": 3.62, "learning_rate": 2.3449881999375586e-09, "logits/chosen": -2.0965781211853027, "logits/rejected": -2.2760794162750244, "logps/chosen": -2.6562814712524414, "logps/rejected": -2.6273999214172363, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": 1.0203349590301514, "rewards/margins": -0.00666356086730957, "rewards/rejected": 1.026998519897461, "step": 6712 }, { "epoch": 3.62, "learning_rate": 2.3383834157801487e-09, "logits/chosen": -2.0485265254974365, "logits/rejected": -2.1033709049224854, "logps/chosen": -3.446364402770996, "logps/rejected": -22.788265228271484, "loss": 0.4056, "rewards/accuracies": 1.0, "rewards/chosen": 1.0418051481246948, "rewards/margins": 0.6927252411842346, "rewards/rejected": 0.3490799069404602, "step": 6713 }, { "epoch": 3.62, "learning_rate": 2.3317877233916093e-09, "logits/chosen": -2.0755724906921387, "logits/rejected": -2.1270041465759277, "logps/chosen": -2.535421133041382, "logps/rejected": -19.186466217041016, "loss": 0.1797, "rewards/accuracies": 1.0, "rewards/chosen": 1.5531667470932007, "rewards/margins": 1.6255102157592773, "rewards/rejected": -0.07234344631433487, "step": 6714 }, { "epoch": 3.62, "learning_rate": 2.3252011240301073e-09, "logits/chosen": -2.044955015182495, "logits/rejected": -2.259134292602539, "logps/chosen": -0.2913389801979065, "logps/rejected": -0.29468029737472534, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.8752536177635193, "rewards/margins": 0.015647709369659424, "rewards/rejected": 0.8596059083938599, "step": 6715 }, { "epoch": 3.62, "learning_rate": 2.318623618952087e-09, "logits/chosen": -2.1098337173461914, "logits/rejected": -2.1103320121765137, "logps/chosen": -3.4823975563049316, "logps/rejected": -2.356794834136963, "loss": 0.3249, "rewards/accuracies": 1.0, "rewards/chosen": 1.5368775129318237, "rewards/margins": 0.9575108885765076, "rewards/rejected": 0.5793666243553162, "step": 6716 }, { "epoch": 3.62, "learning_rate": 2.3120552094122447e-09, "logits/chosen": -2.106480598449707, "logits/rejected": -2.330779790878296, "logps/chosen": -0.3347795307636261, "logps/rejected": -0.3350379467010498, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0259144306182861, "rewards/margins": 0.0134657621383667, "rewards/rejected": 1.0124486684799194, "step": 6717 }, { "epoch": 3.62, "learning_rate": 2.3054958966635508e-09, "logits/chosen": -2.0444183349609375, "logits/rejected": -2.0323431491851807, "logps/chosen": -5.757761001586914, "logps/rejected": -15.60532283782959, "loss": 0.4065, "rewards/accuracies": 1.0, "rewards/chosen": 1.3267452716827393, "rewards/margins": 0.6901684999465942, "rewards/rejected": 0.636576771736145, "step": 6718 }, { "epoch": 3.62, "learning_rate": 2.298945681957237e-09, "logits/chosen": -2.0750551223754883, "logits/rejected": -2.073439598083496, "logps/chosen": -4.006523132324219, "logps/rejected": -7.177062034606934, "loss": 0.459, "rewards/accuracies": 1.0, "rewards/chosen": 1.4179407358169556, "rewards/margins": 0.5405237078666687, "rewards/rejected": 0.8774170279502869, "step": 6719 }, { "epoch": 3.62, "learning_rate": 2.2924045665427994e-09, "logits/chosen": -2.045105218887329, "logits/rejected": -2.055171012878418, "logps/chosen": -1.1490026712417603, "logps/rejected": -2.58501935005188, "loss": 0.4822, "rewards/accuracies": 1.0, "rewards/chosen": 1.0838425159454346, "rewards/margins": 0.47868138551712036, "rewards/rejected": 0.6051611304283142, "step": 6720 }, { "epoch": 3.63, "learning_rate": 2.285872551668e-09, "logits/chosen": -2.256070375442505, "logits/rejected": -2.1634902954101562, "logps/chosen": -19.007469177246094, "logps/rejected": -4.203433513641357, "loss": 0.1972, "rewards/accuracies": 1.0, "rewards/chosen": 2.154653310775757, "rewards/margins": 1.5235909223556519, "rewards/rejected": 0.631062388420105, "step": 6721 }, { "epoch": 3.63, "learning_rate": 2.2793496385788645e-09, "logits/chosen": -2.1722872257232666, "logits/rejected": -2.1679329872131348, "logps/chosen": -2.1667985916137695, "logps/rejected": -6.420226573944092, "loss": 0.5499, "rewards/accuracies": 1.0, "rewards/chosen": 0.9753640294075012, "rewards/margins": 0.31057268381118774, "rewards/rejected": 0.6647913455963135, "step": 6722 }, { "epoch": 3.63, "learning_rate": 2.2728358285196813e-09, "logits/chosen": -2.1490139961242676, "logits/rejected": -2.1433143615722656, "logps/chosen": -5.305309295654297, "logps/rejected": -6.6601948738098145, "loss": 0.3208, "rewards/accuracies": 1.0, "rewards/chosen": 1.2568457126617432, "rewards/margins": 0.9723450541496277, "rewards/rejected": 0.2845006585121155, "step": 6723 }, { "epoch": 3.63, "learning_rate": 2.2663311227330006e-09, "logits/chosen": -2.1145107746124268, "logits/rejected": -2.1139605045318604, "logps/chosen": -1.097479224205017, "logps/rejected": -2.243556499481201, "loss": 0.5289, "rewards/accuracies": 1.0, "rewards/chosen": 1.142364501953125, "rewards/margins": 0.36086344718933105, "rewards/rejected": 0.781501054763794, "step": 6724 }, { "epoch": 3.63, "learning_rate": 2.2598355224596466e-09, "logits/chosen": -2.134567975997925, "logits/rejected": -2.3239288330078125, "logps/chosen": -3.469907283782959, "logps/rejected": -3.165843963623047, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": 1.043142318725586, "rewards/margins": -0.008936166763305664, "rewards/rejected": 1.0520784854888916, "step": 6725 }, { "epoch": 3.63, "learning_rate": 2.253349028938678e-09, "logits/chosen": -1.9781036376953125, "logits/rejected": -2.2905426025390625, "logps/chosen": -1.592272400856018, "logps/rejected": -1.341172456741333, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.7506361603736877, "rewards/margins": -0.013952970504760742, "rewards/rejected": 0.7645891308784485, "step": 6726 }, { "epoch": 3.63, "learning_rate": 2.246871643407461e-09, "logits/chosen": -2.0172367095947266, "logits/rejected": -2.018341541290283, "logps/chosen": -0.864983081817627, "logps/rejected": -3.581833600997925, "loss": 0.4435, "rewards/accuracies": 1.0, "rewards/chosen": 1.1702721118927002, "rewards/margins": 0.5831227898597717, "rewards/rejected": 0.5871493220329285, "step": 6727 }, { "epoch": 3.63, "learning_rate": 2.240403367101584e-09, "logits/chosen": -2.0951101779937744, "logits/rejected": -2.0977249145507812, "logps/chosen": -2.56615948677063, "logps/rejected": -13.08967113494873, "loss": 0.2712, "rewards/accuracies": 1.0, "rewards/chosen": 1.293703556060791, "rewards/margins": 1.1661545038223267, "rewards/rejected": 0.12754908204078674, "step": 6728 }, { "epoch": 3.63, "learning_rate": 2.233944201254917e-09, "logits/chosen": -2.092897653579712, "logits/rejected": -2.2525973320007324, "logps/chosen": -5.916586875915527, "logps/rejected": -0.7211043238639832, "loss": 0.6309, "rewards/accuracies": 1.0, "rewards/chosen": 0.983440101146698, "rewards/margins": 0.1285858154296875, "rewards/rejected": 0.8548542857170105, "step": 6729 }, { "epoch": 3.63, "learning_rate": 2.2274941470995956e-09, "logits/chosen": -2.0046136379241943, "logits/rejected": -1.9998998641967773, "logps/chosen": -0.9569237232208252, "logps/rejected": -5.466858863830566, "loss": 0.4376, "rewards/accuracies": 1.0, "rewards/chosen": 1.0498216152191162, "rewards/margins": 0.5996614694595337, "rewards/rejected": 0.45016011595726013, "step": 6730 }, { "epoch": 3.63, "learning_rate": 2.221053205866008e-09, "logits/chosen": -2.176058053970337, "logits/rejected": -2.140021562576294, "logps/chosen": -4.410394668579102, "logps/rejected": -9.761499404907227, "loss": 0.2338, "rewards/accuracies": 1.0, "rewards/chosen": 1.353102684020996, "rewards/margins": 1.3339354991912842, "rewards/rejected": 0.019167233258485794, "step": 6731 }, { "epoch": 3.63, "learning_rate": 2.2146213787827995e-09, "logits/chosen": -2.079493284225464, "logits/rejected": -2.299379587173462, "logps/chosen": -0.7496751546859741, "logps/rejected": -0.9221081733703613, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.7564637064933777, "rewards/margins": -0.0004786252975463867, "rewards/rejected": 0.7569423317909241, "step": 6732 }, { "epoch": 3.63, "learning_rate": 2.2081986670768826e-09, "logits/chosen": -1.9976295232772827, "logits/rejected": -2.2126104831695557, "logps/chosen": -2.251451253890991, "logps/rejected": -2.2079110145568848, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 1.0223530530929565, "rewards/margins": 0.019425392150878906, "rewards/rejected": 1.0029276609420776, "step": 6733 }, { "epoch": 3.63, "learning_rate": 2.201785071973439e-09, "logits/chosen": -2.030276298522949, "logits/rejected": -2.116748809814453, "logps/chosen": -1.6747300624847412, "logps/rejected": -18.283679962158203, "loss": 0.7417, "rewards/accuracies": 0.0, "rewards/chosen": 0.9069135785102844, "rewards/margins": -0.09493809938430786, "rewards/rejected": 1.0018516778945923, "step": 6734 }, { "epoch": 3.63, "learning_rate": 2.1953805946959005e-09, "logits/chosen": -2.0721468925476074, "logits/rejected": -2.085106372833252, "logps/chosen": -4.245584964752197, "logps/rejected": -10.290605545043945, "loss": 0.2671, "rewards/accuracies": 1.0, "rewards/chosen": 1.725947380065918, "rewards/margins": 1.1835081577301025, "rewards/rejected": 0.5424392819404602, "step": 6735 }, { "epoch": 3.63, "learning_rate": 2.188985236465962e-09, "logits/chosen": -2.0077590942382812, "logits/rejected": -2.016979932785034, "logps/chosen": -3.301325559616089, "logps/rejected": -1.6437079906463623, "loss": 0.6236, "rewards/accuracies": 1.0, "rewards/chosen": 1.0836331844329834, "rewards/margins": 0.14432722330093384, "rewards/rejected": 0.9393059611320496, "step": 6736 }, { "epoch": 3.63, "learning_rate": 2.1825989985035877e-09, "logits/chosen": -1.992701768875122, "logits/rejected": -2.287572145462036, "logps/chosen": -0.3201928436756134, "logps/rejected": -0.32942771911621094, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.8982188105583191, "rewards/margins": 0.004836916923522949, "rewards/rejected": 0.8933818936347961, "step": 6737 }, { "epoch": 3.63, "learning_rate": 2.1762218820269853e-09, "logits/chosen": -2.341554641723633, "logits/rejected": -2.1933681964874268, "logps/chosen": -31.912994384765625, "logps/rejected": -2.155261993408203, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 2.8957679271698, "rewards/margins": 2.0366766452789307, "rewards/rejected": 0.8590912818908691, "step": 6738 }, { "epoch": 3.63, "learning_rate": 2.169853888252643e-09, "logits/chosen": -2.132281541824341, "logits/rejected": -2.281343698501587, "logps/chosen": -0.6150431036949158, "logps/rejected": -0.7225444912910461, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 1.0277823209762573, "rewards/margins": -0.004757881164550781, "rewards/rejected": 1.032540202140808, "step": 6739 }, { "epoch": 3.64, "learning_rate": 2.1634950183952894e-09, "logits/chosen": -2.1318562030792236, "logits/rejected": -2.130951404571533, "logps/chosen": -4.168896198272705, "logps/rejected": -6.022124290466309, "loss": 0.3212, "rewards/accuracies": 1.0, "rewards/chosen": 1.401605486869812, "rewards/margins": 0.9709523916244507, "rewards/rejected": 0.43065309524536133, "step": 6740 }, { "epoch": 3.64, "learning_rate": 2.1571452736679264e-09, "logits/chosen": -2.0755183696746826, "logits/rejected": -2.3582711219787598, "logps/chosen": -2.7309417724609375, "logps/rejected": -3.1211178302764893, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.8047159314155579, "rewards/margins": 0.0055942535400390625, "rewards/rejected": 0.7991216778755188, "step": 6741 }, { "epoch": 3.64, "learning_rate": 2.1508046552818016e-09, "logits/chosen": -2.087923049926758, "logits/rejected": -2.1345057487487793, "logps/chosen": -4.180452823638916, "logps/rejected": -24.71088409423828, "loss": 0.1997, "rewards/accuracies": 1.0, "rewards/chosen": 1.2810331583023071, "rewards/margins": 1.5096371173858643, "rewards/rejected": -0.22860394418239594, "step": 6742 }, { "epoch": 3.64, "learning_rate": 2.144473164446442e-09, "logits/chosen": -2.087479591369629, "logits/rejected": -2.2667713165283203, "logps/chosen": -0.29482343792915344, "logps/rejected": -0.32718825340270996, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.8647276759147644, "rewards/margins": 0.03531700372695923, "rewards/rejected": 0.8294106721878052, "step": 6743 }, { "epoch": 3.64, "learning_rate": 2.1381508023696094e-09, "logits/chosen": -2.0550732612609863, "logits/rejected": -2.2480204105377197, "logps/chosen": -0.6247110366821289, "logps/rejected": -0.5285879969596863, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.8401679992675781, "rewards/margins": 0.030329406261444092, "rewards/rejected": 0.809838593006134, "step": 6744 }, { "epoch": 3.64, "learning_rate": 2.1318375702573442e-09, "logits/chosen": -2.2148025035858154, "logits/rejected": -2.2143383026123047, "logps/chosen": -3.094048023223877, "logps/rejected": -4.294009685516357, "loss": 0.4545, "rewards/accuracies": 1.0, "rewards/chosen": 1.5552109479904175, "rewards/margins": 0.552820086479187, "rewards/rejected": 1.0023908615112305, "step": 6745 }, { "epoch": 3.64, "learning_rate": 2.1255334693139395e-09, "logits/chosen": -2.0185694694519043, "logits/rejected": -2.01086163520813, "logps/chosen": -4.684142112731934, "logps/rejected": -0.9558428525924683, "loss": 0.4811, "rewards/accuracies": 1.0, "rewards/chosen": 1.2566735744476318, "rewards/margins": 0.4814091920852661, "rewards/rejected": 0.7752643823623657, "step": 6746 }, { "epoch": 3.64, "learning_rate": 2.1192385007419324e-09, "logits/chosen": -2.113636016845703, "logits/rejected": -2.1235191822052, "logps/chosen": -4.766367435455322, "logps/rejected": -3.930997371673584, "loss": 0.2685, "rewards/accuracies": 1.0, "rewards/chosen": 1.729310393333435, "rewards/margins": 1.177748441696167, "rewards/rejected": 0.5515618920326233, "step": 6747 }, { "epoch": 3.64, "learning_rate": 2.112952665742146e-09, "logits/chosen": -2.0676281452178955, "logits/rejected": -2.0697691440582275, "logps/chosen": -0.15691882371902466, "logps/rejected": -5.851083755493164, "loss": 0.4966, "rewards/accuracies": 1.0, "rewards/chosen": 0.944465160369873, "rewards/margins": 0.441436767578125, "rewards/rejected": 0.503028392791748, "step": 6748 }, { "epoch": 3.64, "learning_rate": 2.1066759655136214e-09, "logits/chosen": -2.032644033432007, "logits/rejected": -2.2845427989959717, "logps/chosen": -0.731453537940979, "logps/rejected": -0.706261157989502, "loss": 0.6654, "rewards/accuracies": 1.0, "rewards/chosen": 1.0671519041061401, "rewards/margins": 0.05632209777832031, "rewards/rejected": 1.0108298063278198, "step": 6749 }, { "epoch": 3.64, "learning_rate": 2.1004084012537003e-09, "logits/chosen": -2.0626707077026367, "logits/rejected": -2.072923421859741, "logps/chosen": -1.9727057218551636, "logps/rejected": -1.2854852676391602, "loss": 0.3465, "rewards/accuracies": 1.0, "rewards/chosen": 1.636208415031433, "rewards/margins": 0.8815343379974365, "rewards/rejected": 0.7546740770339966, "step": 6750 }, { "epoch": 3.64, "learning_rate": 2.0941499741579606e-09, "logits/chosen": -2.1598782539367676, "logits/rejected": -2.3203744888305664, "logps/chosen": -1.768790364265442, "logps/rejected": -2.004179000854492, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 0.7475051879882812, "rewards/margins": 0.03575456142425537, "rewards/rejected": 0.7117506265640259, "step": 6751 }, { "epoch": 3.64, "learning_rate": 2.0879006854202295e-09, "logits/chosen": -2.1665916442871094, "logits/rejected": -2.1637728214263916, "logps/chosen": -3.879774570465088, "logps/rejected": -5.349266529083252, "loss": 0.2248, "rewards/accuracies": 1.0, "rewards/chosen": 1.7109917402267456, "rewards/margins": 1.3781589269638062, "rewards/rejected": 0.33283278346061707, "step": 6752 }, { "epoch": 3.64, "learning_rate": 2.08166053623261e-09, "logits/chosen": -2.1439990997314453, "logits/rejected": -2.1416499614715576, "logps/chosen": -4.2903265953063965, "logps/rejected": -2.620120048522949, "loss": 0.2939, "rewards/accuracies": 1.0, "rewards/chosen": 1.7089439630508423, "rewards/margins": 1.0738391876220703, "rewards/rejected": 0.6351048350334167, "step": 6753 }, { "epoch": 3.64, "learning_rate": 2.075429527785444e-09, "logits/chosen": -2.2200350761413574, "logits/rejected": -2.134413957595825, "logps/chosen": -29.084081649780273, "logps/rejected": -3.0857791900634766, "loss": 0.1595, "rewards/accuracies": 1.0, "rewards/chosen": 2.425096273422241, "rewards/margins": 1.754713535308838, "rewards/rejected": 0.6703826785087585, "step": 6754 }, { "epoch": 3.64, "learning_rate": 2.069207661267347e-09, "logits/chosen": -2.0514888763427734, "logits/rejected": -2.054692506790161, "logps/chosen": -1.4764952659606934, "logps/rejected": -1.401214361190796, "loss": 0.5316, "rewards/accuracies": 1.0, "rewards/chosen": 1.0793393850326538, "rewards/margins": 0.3544188141822815, "rewards/rejected": 0.7249205708503723, "step": 6755 }, { "epoch": 3.64, "learning_rate": 2.06299493786517e-09, "logits/chosen": -1.9549410343170166, "logits/rejected": -2.2565362453460693, "logps/chosen": -0.5580198764801025, "logps/rejected": -0.5820788145065308, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.8377816081047058, "rewards/margins": 0.02013111114501953, "rewards/rejected": 0.8176504969596863, "step": 6756 }, { "epoch": 3.64, "learning_rate": 2.0567913587640372e-09, "logits/chosen": -2.0685901641845703, "logits/rejected": -2.0398366451263428, "logps/chosen": -12.95681381225586, "logps/rejected": -2.6255204677581787, "loss": 0.2591, "rewards/accuracies": 1.0, "rewards/chosen": 1.7248986959457397, "rewards/margins": 1.2181193828582764, "rewards/rejected": 0.5067793130874634, "step": 6757 }, { "epoch": 3.65, "learning_rate": 2.0505969251473244e-09, "logits/chosen": -2.0662198066711426, "logits/rejected": -2.3199727535247803, "logps/chosen": -0.054955288767814636, "logps/rejected": -0.04942841827869415, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.8711969256401062, "rewards/margins": 0.026427745819091797, "rewards/rejected": 0.8447691798210144, "step": 6758 }, { "epoch": 3.65, "learning_rate": 2.0444116381966526e-09, "logits/chosen": -2.124696731567383, "logits/rejected": -2.1274867057800293, "logps/chosen": -3.0923213958740234, "logps/rejected": -4.661189079284668, "loss": 0.4152, "rewards/accuracies": 1.0, "rewards/chosen": 1.0655230283737183, "rewards/margins": 0.6641459465026855, "rewards/rejected": 0.4013771116733551, "step": 6759 }, { "epoch": 3.65, "learning_rate": 2.038235499091917e-09, "logits/chosen": -2.2543673515319824, "logits/rejected": -2.2539074420928955, "logps/chosen": -1.6341331005096436, "logps/rejected": -1.2350883483886719, "loss": 0.5493, "rewards/accuracies": 1.0, "rewards/chosen": 1.006854772567749, "rewards/margins": 0.31181156635284424, "rewards/rejected": 0.6950432062149048, "step": 6760 }, { "epoch": 3.65, "learning_rate": 2.0320685090112476e-09, "logits/chosen": -2.101306438446045, "logits/rejected": -2.337087392807007, "logps/chosen": -0.610442042350769, "logps/rejected": -0.5336226224899292, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.9955951571464539, "rewards/margins": -0.012646734714508057, "rewards/rejected": 1.008241891860962, "step": 6761 }, { "epoch": 3.65, "learning_rate": 2.025910669131048e-09, "logits/chosen": -2.1146273612976074, "logits/rejected": -2.113696336746216, "logps/chosen": -0.5442378520965576, "logps/rejected": -1.852308750152588, "loss": 0.6165, "rewards/accuracies": 1.0, "rewards/chosen": 0.9680294990539551, "rewards/margins": 0.1597035527229309, "rewards/rejected": 0.8083259463310242, "step": 6762 }, { "epoch": 3.65, "learning_rate": 2.019761980625956e-09, "logits/chosen": -1.9617758989334106, "logits/rejected": -2.23479962348938, "logps/chosen": -0.21683189272880554, "logps/rejected": -0.23583614826202393, "loss": 0.6998, "rewards/accuracies": 0.0, "rewards/chosen": 0.9477003216743469, "rewards/margins": -0.013240993022918701, "rewards/rejected": 0.9609413146972656, "step": 6763 }, { "epoch": 3.65, "learning_rate": 2.0136224446688833e-09, "logits/chosen": -2.1089224815368652, "logits/rejected": -2.105337381362915, "logps/chosen": -2.20910906791687, "logps/rejected": -6.343870639801025, "loss": 0.3485, "rewards/accuracies": 1.0, "rewards/chosen": 1.33733069896698, "rewards/margins": 0.8749778270721436, "rewards/rejected": 0.4623529016971588, "step": 6764 }, { "epoch": 3.65, "learning_rate": 2.0074920624309822e-09, "logits/chosen": -2.1128251552581787, "logits/rejected": -2.1151201725006104, "logps/chosen": -5.5514020919799805, "logps/rejected": -9.986519813537598, "loss": 0.2391, "rewards/accuracies": 1.0, "rewards/chosen": 1.6303653717041016, "rewards/margins": 1.309174656867981, "rewards/rejected": 0.321190744638443, "step": 6765 }, { "epoch": 3.65, "learning_rate": 2.0013708350816673e-09, "logits/chosen": -2.0398941040039062, "logits/rejected": -2.04689884185791, "logps/chosen": -0.694218099117279, "logps/rejected": -3.8972136974334717, "loss": 0.4714, "rewards/accuracies": 1.0, "rewards/chosen": 0.9878214001655579, "rewards/margins": 0.5070584416389465, "rewards/rejected": 0.48076295852661133, "step": 6766 }, { "epoch": 3.65, "learning_rate": 1.9952587637885987e-09, "logits/chosen": -1.9524983167648315, "logits/rejected": -2.2717440128326416, "logps/chosen": -2.5076231956481934, "logps/rejected": -5.38969612121582, "loss": 0.6616, "rewards/accuracies": 1.0, "rewards/chosen": 0.9156305193901062, "rewards/margins": 0.06406915187835693, "rewards/rejected": 0.8515613675117493, "step": 6767 }, { "epoch": 3.65, "learning_rate": 1.9891558497176995e-09, "logits/chosen": -1.981450080871582, "logits/rejected": -1.9890916347503662, "logps/chosen": -1.8651610612869263, "logps/rejected": -4.086956024169922, "loss": 0.3986, "rewards/accuracies": 1.0, "rewards/chosen": 1.177613615989685, "rewards/margins": 0.71391761302948, "rewards/rejected": 0.4636960029602051, "step": 6768 }, { "epoch": 3.65, "learning_rate": 1.9830620940331387e-09, "logits/chosen": -2.1318302154541016, "logits/rejected": -2.116943597793579, "logps/chosen": -3.2436728477478027, "logps/rejected": -5.04291296005249, "loss": 0.3077, "rewards/accuracies": 1.0, "rewards/chosen": 1.7455295324325562, "rewards/margins": 1.0209457874298096, "rewards/rejected": 0.7245838046073914, "step": 6769 }, { "epoch": 3.65, "learning_rate": 1.976977497897342e-09, "logits/chosen": -2.0773158073425293, "logits/rejected": -2.3439204692840576, "logps/chosen": -0.6465518474578857, "logps/rejected": -0.5336203575134277, "loss": 0.6768, "rewards/accuracies": 1.0, "rewards/chosen": 1.0311919450759888, "rewards/margins": 0.032991886138916016, "rewards/rejected": 0.9982000589370728, "step": 6770 }, { "epoch": 3.65, "learning_rate": 1.970902062470975e-09, "logits/chosen": -2.1672213077545166, "logits/rejected": -2.054110288619995, "logps/chosen": -20.067989349365234, "logps/rejected": -4.156757354736328, "loss": 0.1115, "rewards/accuracies": 1.0, "rewards/chosen": 2.645099639892578, "rewards/margins": 2.1371185779571533, "rewards/rejected": 0.50798100233078, "step": 6771 }, { "epoch": 3.65, "learning_rate": 1.964835788912983e-09, "logits/chosen": -1.977370023727417, "logits/rejected": -1.9756906032562256, "logps/chosen": -0.7223492860794067, "logps/rejected": -3.9789326190948486, "loss": 0.4947, "rewards/accuracies": 1.0, "rewards/chosen": 1.1774088144302368, "rewards/margins": 0.44638872146606445, "rewards/rejected": 0.7310200929641724, "step": 6772 }, { "epoch": 3.65, "learning_rate": 1.9587786783805405e-09, "logits/chosen": -2.068375825881958, "logits/rejected": -2.2845962047576904, "logps/chosen": -0.2362641841173172, "logps/rejected": -0.24595032632350922, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.9397552609443665, "rewards/margins": 0.023962795734405518, "rewards/rejected": 0.9157924652099609, "step": 6773 }, { "epoch": 3.65, "learning_rate": 1.9527307320290896e-09, "logits/chosen": -2.0387251377105713, "logits/rejected": -2.3455655574798584, "logps/chosen": -0.5905796885490417, "logps/rejected": -0.5373555421829224, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 1.0710026025772095, "rewards/margins": 0.02468705177307129, "rewards/rejected": 1.0463155508041382, "step": 6774 }, { "epoch": 3.65, "learning_rate": 1.9466919510123026e-09, "logits/chosen": -2.0339441299438477, "logits/rejected": -2.314610242843628, "logps/chosen": -0.9922329187393188, "logps/rejected": -0.6997660994529724, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.8725377917289734, "rewards/margins": 0.028113603591918945, "rewards/rejected": 0.8444241881370544, "step": 6775 }, { "epoch": 3.65, "learning_rate": 1.940662336482124e-09, "logits/chosen": -2.05841326713562, "logits/rejected": -2.064579963684082, "logps/chosen": -1.3613922595977783, "logps/rejected": -2.489204168319702, "loss": 0.4415, "rewards/accuracies": 1.0, "rewards/chosen": 1.2254475355148315, "rewards/margins": 0.5887725353240967, "rewards/rejected": 0.6366750001907349, "step": 6776 }, { "epoch": 3.66, "learning_rate": 1.934641889588751e-09, "logits/chosen": -2.0922043323516846, "logits/rejected": -2.0972604751586914, "logps/chosen": -1.2506351470947266, "logps/rejected": -3.0074305534362793, "loss": 0.4239, "rewards/accuracies": 1.0, "rewards/chosen": 1.164749264717102, "rewards/margins": 0.6388146281242371, "rewards/rejected": 0.525934636592865, "step": 6777 }, { "epoch": 3.66, "learning_rate": 1.9286306114806084e-09, "logits/chosen": -2.1425576210021973, "logits/rejected": -2.3848538398742676, "logps/chosen": -0.41178908944129944, "logps/rejected": -0.47455036640167236, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 1.0247372388839722, "rewards/margins": 0.0024639368057250977, "rewards/rejected": 1.022273302078247, "step": 6778 }, { "epoch": 3.66, "learning_rate": 1.9226285033043967e-09, "logits/chosen": -2.142158031463623, "logits/rejected": -2.1336700916290283, "logps/chosen": -10.514692306518555, "logps/rejected": -4.001245021820068, "loss": 0.3434, "rewards/accuracies": 1.0, "rewards/chosen": 1.4038034677505493, "rewards/margins": 0.892389178276062, "rewards/rejected": 0.5114142894744873, "step": 6779 }, { "epoch": 3.66, "learning_rate": 1.9166355662050492e-09, "logits/chosen": -2.2631232738494873, "logits/rejected": -2.2827255725860596, "logps/chosen": -11.301412582397461, "logps/rejected": -15.0048828125, "loss": 0.4518, "rewards/accuracies": 1.0, "rewards/chosen": 1.9343976974487305, "rewards/margins": 0.560003399848938, "rewards/rejected": 1.3743942975997925, "step": 6780 }, { "epoch": 3.66, "learning_rate": 1.9106518013257687e-09, "logits/chosen": -1.954662561416626, "logits/rejected": -2.3020482063293457, "logps/chosen": -0.5625382661819458, "logps/rejected": -0.465583860874176, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.8785942196846008, "rewards/margins": 0.0038704872131347656, "rewards/rejected": 0.8747237324714661, "step": 6781 }, { "epoch": 3.66, "learning_rate": 1.9046772098079864e-09, "logits/chosen": -1.9987707138061523, "logits/rejected": -1.976076364517212, "logps/chosen": -9.38447093963623, "logps/rejected": -2.9068939685821533, "loss": 0.4623, "rewards/accuracies": 1.0, "rewards/chosen": 1.6185287237167358, "rewards/margins": 0.5313736200332642, "rewards/rejected": 1.0871551036834717, "step": 6782 }, { "epoch": 3.66, "learning_rate": 1.8987117927914077e-09, "logits/chosen": -2.1044909954071045, "logits/rejected": -2.2937419414520264, "logps/chosen": -0.5580209493637085, "logps/rejected": -0.6508052945137024, "loss": 0.673, "rewards/accuracies": 1.0, "rewards/chosen": 0.7770060896873474, "rewards/margins": 0.04071468114852905, "rewards/rejected": 0.7362914085388184, "step": 6783 }, { "epoch": 3.66, "learning_rate": 1.892755551413966e-09, "logits/chosen": -2.1198885440826416, "logits/rejected": -2.309584379196167, "logps/chosen": -2.8266561031341553, "logps/rejected": -2.841886281967163, "loss": 0.6615, "rewards/accuracies": 1.0, "rewards/chosen": 0.6552576422691345, "rewards/margins": 0.06439650058746338, "rewards/rejected": 0.5908611416816711, "step": 6784 }, { "epoch": 3.66, "learning_rate": 1.886808486811864e-09, "logits/chosen": -2.074751138687134, "logits/rejected": -2.0747263431549072, "logps/chosen": -1.8269643783569336, "logps/rejected": -1.7791101932525635, "loss": 0.4494, "rewards/accuracies": 1.0, "rewards/chosen": 1.411346197128296, "rewards/margins": 0.5667744874954224, "rewards/rejected": 0.8445717096328735, "step": 6785 }, { "epoch": 3.66, "learning_rate": 1.880870600119533e-09, "logits/chosen": -1.9951938390731812, "logits/rejected": -2.0035433769226074, "logps/chosen": -4.112076759338379, "logps/rejected": -5.254961013793945, "loss": 0.2223, "rewards/accuracies": 1.0, "rewards/chosen": 1.7734425067901611, "rewards/margins": 1.390468716621399, "rewards/rejected": 0.3829737603664398, "step": 6786 }, { "epoch": 3.66, "learning_rate": 1.8749418924696723e-09, "logits/chosen": -2.116111993789673, "logits/rejected": -2.0284924507141113, "logps/chosen": -34.70616912841797, "logps/rejected": -4.636693954467773, "loss": 0.1153, "rewards/accuracies": 1.0, "rewards/chosen": 2.37322998046875, "rewards/margins": 2.1023313999176025, "rewards/rejected": 0.2708984911441803, "step": 6787 }, { "epoch": 3.66, "learning_rate": 1.869022364993217e-09, "logits/chosen": -2.0914275646209717, "logits/rejected": -2.308870315551758, "logps/chosen": -0.11312755197286606, "logps/rejected": -0.10158300399780273, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.9287683367729187, "rewards/margins": -0.005962967872619629, "rewards/rejected": 0.9347313046455383, "step": 6788 }, { "epoch": 3.66, "learning_rate": 1.8631120188193516e-09, "logits/chosen": -1.9873082637786865, "logits/rejected": -2.3072245121002197, "logps/chosen": -0.2805883288383484, "logps/rejected": -0.27484118938446045, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.8163051009178162, "rewards/margins": 0.008956432342529297, "rewards/rejected": 0.8073486685752869, "step": 6789 }, { "epoch": 3.66, "learning_rate": 1.857210855075525e-09, "logits/chosen": -2.115201473236084, "logits/rejected": -2.108576774597168, "logps/chosen": -0.48194777965545654, "logps/rejected": -4.440603256225586, "loss": 0.5454, "rewards/accuracies": 1.0, "rewards/chosen": 0.9549605250358582, "rewards/margins": 0.3211820125579834, "rewards/rejected": 0.6337785124778748, "step": 6790 }, { "epoch": 3.66, "learning_rate": 1.8513188748874197e-09, "logits/chosen": -2.10988450050354, "logits/rejected": -2.1075713634490967, "logps/chosen": -0.8364103436470032, "logps/rejected": -4.544210910797119, "loss": 0.4661, "rewards/accuracies": 1.0, "rewards/chosen": 1.0914430618286133, "rewards/margins": 0.5213416218757629, "rewards/rejected": 0.5701014399528503, "step": 6791 }, { "epoch": 3.66, "learning_rate": 1.8454360793789647e-09, "logits/chosen": -2.0818235874176025, "logits/rejected": -2.1363413333892822, "logps/chosen": -4.253483772277832, "logps/rejected": -12.6211576461792, "loss": 0.2896, "rewards/accuracies": 1.0, "rewards/chosen": 1.5955537557601929, "rewards/margins": 1.0908573865890503, "rewards/rejected": 0.5046963691711426, "step": 6792 }, { "epoch": 3.66, "learning_rate": 1.8395624696723455e-09, "logits/chosen": -2.0418734550476074, "logits/rejected": -2.3023457527160645, "logps/chosen": -0.11145390570163727, "logps/rejected": -0.15615777671337128, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9982349276542664, "rewards/margins": 0.01357412338256836, "rewards/rejected": 0.984660804271698, "step": 6793 }, { "epoch": 3.66, "learning_rate": 1.8336980468879937e-09, "logits/chosen": -2.146608591079712, "logits/rejected": -2.2975454330444336, "logps/chosen": -3.0417652130126953, "logps/rejected": -3.589104175567627, "loss": 0.8211, "rewards/accuracies": 0.0, "rewards/chosen": 0.9117532968521118, "rewards/margins": -0.24145102500915527, "rewards/rejected": 1.153204321861267, "step": 6794 }, { "epoch": 3.67, "learning_rate": 1.827842812144581e-09, "logits/chosen": -2.121959924697876, "logits/rejected": -2.2755846977233887, "logps/chosen": -0.13471029698848724, "logps/rejected": -0.12220628559589386, "loss": 0.6728, "rewards/accuracies": 1.0, "rewards/chosen": 0.954657256603241, "rewards/margins": 0.04120081663131714, "rewards/rejected": 0.9134564399719238, "step": 6795 }, { "epoch": 3.67, "learning_rate": 1.821996766559042e-09, "logits/chosen": -2.1391401290893555, "logits/rejected": -2.1400301456451416, "logps/chosen": -0.12499155104160309, "logps/rejected": -5.562436103820801, "loss": 0.4884, "rewards/accuracies": 1.0, "rewards/chosen": 0.8502707481384277, "rewards/margins": 0.46245792508125305, "rewards/rejected": 0.3878128230571747, "step": 6796 }, { "epoch": 3.67, "learning_rate": 1.8161599112465398e-09, "logits/chosen": -2.16538667678833, "logits/rejected": -2.171553373336792, "logps/chosen": -3.660132884979248, "logps/rejected": -3.8798635005950928, "loss": 0.545, "rewards/accuracies": 1.0, "rewards/chosen": 0.9548347592353821, "rewards/margins": 0.3220359683036804, "rewards/rejected": 0.6327987909317017, "step": 6797 }, { "epoch": 3.67, "learning_rate": 1.8103322473204952e-09, "logits/chosen": -2.2367403507232666, "logits/rejected": -2.16768741607666, "logps/chosen": -27.17731475830078, "logps/rejected": -6.757445335388184, "loss": 0.1779, "rewards/accuracies": 1.0, "rewards/chosen": 2.132331609725952, "rewards/margins": 1.6362977027893066, "rewards/rejected": 0.4960338771343231, "step": 6798 }, { "epoch": 3.67, "learning_rate": 1.80451377589258e-09, "logits/chosen": -2.092886447906494, "logits/rejected": -2.0862436294555664, "logps/chosen": -2.1942529678344727, "logps/rejected": -6.315062046051025, "loss": 0.4496, "rewards/accuracies": 1.0, "rewards/chosen": 1.015756607055664, "rewards/margins": 0.5660568475723267, "rewards/rejected": 0.449699729681015, "step": 6799 }, { "epoch": 3.67, "learning_rate": 1.798704498072695e-09, "logits/chosen": -2.0730886459350586, "logits/rejected": -2.0741372108459473, "logps/chosen": -5.093160152435303, "logps/rejected": -4.998409748077393, "loss": 0.2768, "rewards/accuracies": 1.0, "rewards/chosen": 1.5147466659545898, "rewards/margins": 1.142862319946289, "rewards/rejected": 0.3718843162059784, "step": 6800 }, { "epoch": 3.67, "learning_rate": 1.7929044149690043e-09, "logits/chosen": -1.9470117092132568, "logits/rejected": -1.952232837677002, "logps/chosen": -1.2965399026870728, "logps/rejected": -5.335906982421875, "loss": 0.4199, "rewards/accuracies": 1.0, "rewards/chosen": 0.9927197694778442, "rewards/margins": 0.6504867076873779, "rewards/rejected": 0.3422330915927887, "step": 6801 }, { "epoch": 3.67, "learning_rate": 1.7871135276879167e-09, "logits/chosen": -2.026991605758667, "logits/rejected": -2.3113701343536377, "logps/chosen": -0.31474822759628296, "logps/rejected": -4.853857040405273, "loss": 0.625, "rewards/accuracies": 1.0, "rewards/chosen": 0.8358103036880493, "rewards/margins": 0.14117634296417236, "rewards/rejected": 0.694633960723877, "step": 6802 }, { "epoch": 3.67, "learning_rate": 1.781331837334077e-09, "logits/chosen": -1.993188500404358, "logits/rejected": -2.3218019008636475, "logps/chosen": -1.950379729270935, "logps/rejected": -2.0018527507781982, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 1.0719350576400757, "rewards/margins": -0.004737377166748047, "rewards/rejected": 1.0766724348068237, "step": 6803 }, { "epoch": 3.67, "learning_rate": 1.77555934501038e-09, "logits/chosen": -2.0647811889648438, "logits/rejected": -2.275332450866699, "logps/chosen": -1.298757553100586, "logps/rejected": -1.6696093082427979, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 1.0704118013381958, "rewards/margins": 0.022717714309692383, "rewards/rejected": 1.0476940870285034, "step": 6804 }, { "epoch": 3.67, "learning_rate": 1.7697960518179623e-09, "logits/chosen": -2.1670427322387695, "logits/rejected": -2.1665875911712646, "logps/chosen": -0.785804808139801, "logps/rejected": -2.0175788402557373, "loss": 0.6668, "rewards/accuracies": 1.0, "rewards/chosen": 0.9065324068069458, "rewards/margins": 0.05346345901489258, "rewards/rejected": 0.8530689477920532, "step": 6805 }, { "epoch": 3.67, "learning_rate": 1.7640419588562217e-09, "logits/chosen": -2.0200283527374268, "logits/rejected": -2.015974283218384, "logps/chosen": -6.736880302429199, "logps/rejected": -4.60345458984375, "loss": 0.2867, "rewards/accuracies": 1.0, "rewards/chosen": 1.496545672416687, "rewards/margins": 1.1025561094284058, "rewards/rejected": 0.39398956298828125, "step": 6806 }, { "epoch": 3.67, "learning_rate": 1.7582970672227915e-09, "logits/chosen": -2.3174691200256348, "logits/rejected": -2.1913135051727295, "logps/chosen": -28.15509033203125, "logps/rejected": -2.647368907928467, "loss": 0.1707, "rewards/accuracies": 1.0, "rewards/chosen": 2.6881821155548096, "rewards/margins": 1.6815338134765625, "rewards/rejected": 1.006648302078247, "step": 6807 }, { "epoch": 3.67, "learning_rate": 1.752561378013534e-09, "logits/chosen": -2.197342872619629, "logits/rejected": -2.1938295364379883, "logps/chosen": -5.467222213745117, "logps/rejected": -6.992972373962402, "loss": 0.2424, "rewards/accuracies": 1.0, "rewards/chosen": 1.5022987127304077, "rewards/margins": 1.2936763763427734, "rewards/rejected": 0.20862236618995667, "step": 6808 }, { "epoch": 3.67, "learning_rate": 1.7468348923225796e-09, "logits/chosen": -2.1845133304595947, "logits/rejected": -2.310238838195801, "logps/chosen": -4.783586025238037, "logps/rejected": -1.0338106155395508, "loss": 0.7331, "rewards/accuracies": 0.0, "rewards/chosen": 1.00558340549469, "rewards/margins": -0.0782783031463623, "rewards/rejected": 1.0838617086410522, "step": 6809 }, { "epoch": 3.67, "learning_rate": 1.7411176112422822e-09, "logits/chosen": -2.212456226348877, "logits/rejected": -2.070096969604492, "logps/chosen": -28.316158294677734, "logps/rejected": -5.594515800476074, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 2.370335817337036, "rewards/margins": 1.9250669479370117, "rewards/rejected": 0.4452689290046692, "step": 6810 }, { "epoch": 3.67, "learning_rate": 1.7354095358632637e-09, "logits/chosen": -2.029646396636963, "logits/rejected": -2.043071746826172, "logps/chosen": -3.136133909225464, "logps/rejected": -0.9600023031234741, "loss": 0.5951, "rewards/accuracies": 1.0, "rewards/chosen": 0.9094882011413574, "rewards/margins": 0.20683413743972778, "rewards/rejected": 0.7026540637016296, "step": 6811 }, { "epoch": 3.67, "learning_rate": 1.7297106672743755e-09, "logits/chosen": -1.8728585243225098, "logits/rejected": -2.2752785682678223, "logps/chosen": -0.3634721040725708, "logps/rejected": -0.5006309151649475, "loss": 0.6691, "rewards/accuracies": 1.0, "rewards/chosen": 0.8262101411819458, "rewards/margins": 0.0487484335899353, "rewards/rejected": 0.7774617075920105, "step": 6812 }, { "epoch": 3.67, "learning_rate": 1.7240210065627092e-09, "logits/chosen": -2.239375591278076, "logits/rejected": -2.109772205352783, "logps/chosen": -40.900516510009766, "logps/rejected": -10.580056190490723, "loss": 0.1516, "rewards/accuracies": 1.0, "rewards/chosen": 2.703529119491577, "rewards/margins": 1.810052514076233, "rewards/rejected": 0.8934766054153442, "step": 6813 }, { "epoch": 3.68, "learning_rate": 1.7183405548136076e-09, "logits/chosen": -2.1388325691223145, "logits/rejected": -2.283682346343994, "logps/chosen": -1.1080224514007568, "logps/rejected": -1.034727931022644, "loss": 0.6977, "rewards/accuracies": 0.0, "rewards/chosen": 0.8343263864517212, "rewards/margins": -0.00901782512664795, "rewards/rejected": 0.8433442115783691, "step": 6814 }, { "epoch": 3.68, "learning_rate": 1.712669313110654e-09, "logits/chosen": -2.0969696044921875, "logits/rejected": -2.0444183349609375, "logps/chosen": -23.949806213378906, "logps/rejected": -3.6355974674224854, "loss": 0.2428, "rewards/accuracies": 1.0, "rewards/chosen": 1.9936904907226562, "rewards/margins": 1.2916206121444702, "rewards/rejected": 0.702069878578186, "step": 6815 }, { "epoch": 3.68, "learning_rate": 1.7070072825356719e-09, "logits/chosen": -2.1182143688201904, "logits/rejected": -2.1143548488616943, "logps/chosen": -0.1942443698644638, "logps/rejected": -5.763880729675293, "loss": 0.4186, "rewards/accuracies": 1.0, "rewards/chosen": 0.9970337152481079, "rewards/margins": 0.6542852520942688, "rewards/rejected": 0.3427484631538391, "step": 6816 }, { "epoch": 3.68, "learning_rate": 1.701354464168736e-09, "logits/chosen": -2.142580509185791, "logits/rejected": -2.143091917037964, "logps/chosen": -0.10810860991477966, "logps/rejected": -6.111189842224121, "loss": 0.4007, "rewards/accuracies": 1.0, "rewards/chosen": 1.01594877243042, "rewards/margins": 0.7073806524276733, "rewards/rejected": 0.3085680902004242, "step": 6817 }, { "epoch": 3.68, "learning_rate": 1.6957108590881618e-09, "logits/chosen": -2.0877556800842285, "logits/rejected": -2.0867831707000732, "logps/chosen": -1.4783834218978882, "logps/rejected": -4.601930141448975, "loss": 0.5279, "rewards/accuracies": 1.0, "rewards/chosen": 0.8092569708824158, "rewards/margins": 0.3632953464984894, "rewards/rejected": 0.4459616243839264, "step": 6818 }, { "epoch": 3.68, "learning_rate": 1.6900764683704993e-09, "logits/chosen": -2.1549510955810547, "logits/rejected": -2.3551061153411865, "logps/chosen": -2.4679510593414307, "logps/rejected": -4.793286323547363, "loss": 0.7219, "rewards/accuracies": 0.0, "rewards/chosen": 1.1401679515838623, "rewards/margins": -0.05676877498626709, "rewards/rejected": 1.1969367265701294, "step": 6819 }, { "epoch": 3.68, "learning_rate": 1.6844512930905441e-09, "logits/chosen": -2.086111068725586, "logits/rejected": -2.074740409851074, "logps/chosen": -6.790982246398926, "logps/rejected": -0.6755251288414001, "loss": 0.4489, "rewards/accuracies": 1.0, "rewards/chosen": 1.6060913801193237, "rewards/margins": 0.5681170225143433, "rewards/rejected": 1.0379743576049805, "step": 6820 }, { "epoch": 3.68, "learning_rate": 1.6788353343213436e-09, "logits/chosen": -2.1660666465759277, "logits/rejected": -2.38954496383667, "logps/chosen": -0.3000696301460266, "logps/rejected": -14.96536636352539, "loss": 0.7532, "rewards/accuracies": 0.0, "rewards/chosen": 0.8531946539878845, "rewards/margins": -0.1166730523109436, "rewards/rejected": 0.9698677062988281, "step": 6821 }, { "epoch": 3.68, "learning_rate": 1.6732285931341738e-09, "logits/chosen": -2.065648078918457, "logits/rejected": -2.286817789077759, "logps/chosen": -0.3026118278503418, "logps/rejected": -0.42134708166122437, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.9202930331230164, "rewards/margins": 0.007881700992584229, "rewards/rejected": 0.9124113321304321, "step": 6822 }, { "epoch": 3.68, "learning_rate": 1.667631070598552e-09, "logits/chosen": -2.051790237426758, "logits/rejected": -2.2511467933654785, "logps/chosen": -0.8917872905731201, "logps/rejected": -0.8426558971405029, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.9845995903015137, "rewards/margins": 0.004136443138122559, "rewards/rejected": 0.9804631471633911, "step": 6823 }, { "epoch": 3.68, "learning_rate": 1.6620427677822567e-09, "logits/chosen": -2.0744168758392334, "logits/rejected": -2.0775413513183594, "logps/chosen": -0.7863742709159851, "logps/rejected": -17.229740142822266, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 1.0846651792526245, "rewards/margins": 0.020165443420410156, "rewards/rejected": 1.0644997358322144, "step": 6824 }, { "epoch": 3.68, "learning_rate": 1.6564636857512804e-09, "logits/chosen": -1.996664047241211, "logits/rejected": -2.2894833087921143, "logps/chosen": -0.40001362562179565, "logps/rejected": -0.4462236762046814, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.9396980404853821, "rewards/margins": 0.020801842212677002, "rewards/rejected": 0.9188961982727051, "step": 6825 }, { "epoch": 3.68, "learning_rate": 1.6508938255698768e-09, "logits/chosen": -2.1037096977233887, "logits/rejected": -2.1002707481384277, "logps/chosen": -2.158050298690796, "logps/rejected": -6.806310176849365, "loss": 0.2513, "rewards/accuracies": 1.0, "rewards/chosen": 1.5603771209716797, "rewards/margins": 1.252779245376587, "rewards/rejected": 0.3075978755950928, "step": 6826 }, { "epoch": 3.68, "learning_rate": 1.645333188300524e-09, "logits/chosen": -2.0724143981933594, "logits/rejected": -2.334202527999878, "logps/chosen": -4.337446689605713, "logps/rejected": -4.154480934143066, "loss": 0.7027, "rewards/accuracies": 0.0, "rewards/chosen": 1.1593927145004272, "rewards/margins": -0.018946170806884766, "rewards/rejected": 1.178338885307312, "step": 6827 }, { "epoch": 3.68, "learning_rate": 1.6397817750039678e-09, "logits/chosen": -2.0669798851013184, "logits/rejected": -2.0680606365203857, "logps/chosen": -1.3370180130004883, "logps/rejected": -1.975911021232605, "loss": 0.6267, "rewards/accuracies": 1.0, "rewards/chosen": 1.230641484260559, "rewards/margins": 0.1375354528427124, "rewards/rejected": 1.0931060314178467, "step": 6828 }, { "epoch": 3.68, "learning_rate": 1.6342395867391611e-09, "logits/chosen": -2.073221445083618, "logits/rejected": -2.2558696269989014, "logps/chosen": -0.6978893280029297, "logps/rejected": -0.7693824768066406, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": 0.9708172678947449, "rewards/margins": 0.036174774169921875, "rewards/rejected": 0.934642493724823, "step": 6829 }, { "epoch": 3.68, "learning_rate": 1.6287066245633252e-09, "logits/chosen": -1.9670122861862183, "logits/rejected": -1.9680696725845337, "logps/chosen": -1.5860131978988647, "logps/rejected": -0.6221018433570862, "loss": 0.6348, "rewards/accuracies": 1.0, "rewards/chosen": 1.080539584159851, "rewards/margins": 0.12025189399719238, "rewards/rejected": 0.9602876901626587, "step": 6830 }, { "epoch": 3.68, "learning_rate": 1.623182889531899e-09, "logits/chosen": -2.0181102752685547, "logits/rejected": -2.016988515853882, "logps/chosen": -5.402026176452637, "logps/rejected": -17.204113006591797, "loss": 0.39, "rewards/accuracies": 1.0, "rewards/chosen": 0.9121716618537903, "rewards/margins": 0.7402623295783997, "rewards/rejected": 0.17190933227539062, "step": 6831 }, { "epoch": 3.69, "learning_rate": 1.6176683826985782e-09, "logits/chosen": -1.9995346069335938, "logits/rejected": -1.994692087173462, "logps/chosen": -3.324998378753662, "logps/rejected": -8.673664093017578, "loss": 0.4651, "rewards/accuracies": 1.0, "rewards/chosen": 0.9582365155220032, "rewards/margins": 0.524034857749939, "rewards/rejected": 0.4342016279697418, "step": 6832 }, { "epoch": 3.69, "learning_rate": 1.612163105115283e-09, "logits/chosen": -2.1394078731536865, "logits/rejected": -2.265137195587158, "logps/chosen": -1.3267467021942139, "logps/rejected": -1.2558034658432007, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.8687981963157654, "rewards/margins": -0.01755768060684204, "rewards/rejected": 0.8863558769226074, "step": 6833 }, { "epoch": 3.69, "learning_rate": 1.606667057832195e-09, "logits/chosen": -2.0505802631378174, "logits/rejected": -2.048288583755493, "logps/chosen": -5.139091968536377, "logps/rejected": -4.626255512237549, "loss": 0.3891, "rewards/accuracies": 1.0, "rewards/chosen": 1.2002620697021484, "rewards/margins": 0.743085503578186, "rewards/rejected": 0.45717653632164, "step": 6834 }, { "epoch": 3.69, "learning_rate": 1.6011802418977094e-09, "logits/chosen": -2.1263551712036133, "logits/rejected": -2.3843092918395996, "logps/chosen": -0.8007733225822449, "logps/rejected": -5.006962776184082, "loss": 0.5969, "rewards/accuracies": 1.0, "rewards/chosen": 0.9856521487236023, "rewards/margins": 0.20278501510620117, "rewards/rejected": 0.7828671336174011, "step": 6835 }, { "epoch": 3.69, "learning_rate": 1.595702658358483e-09, "logits/chosen": -2.1371145248413086, "logits/rejected": -2.251396656036377, "logps/chosen": -2.02717924118042, "logps/rejected": -1.8361319303512573, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 0.9747499823570251, "rewards/margins": 0.02204287052154541, "rewards/rejected": 0.9527071118354797, "step": 6836 }, { "epoch": 3.69, "learning_rate": 1.5902343082593917e-09, "logits/chosen": -2.0775604248046875, "logits/rejected": -2.080343008041382, "logps/chosen": -4.156271457672119, "logps/rejected": -1.6756116151809692, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 1.445986032485962, "rewards/margins": 0.5090306401252747, "rewards/rejected": 0.9369553923606873, "step": 6837 }, { "epoch": 3.69, "learning_rate": 1.5847751926435616e-09, "logits/chosen": -2.033512830734253, "logits/rejected": -2.044394016265869, "logps/chosen": -1.2224622964859009, "logps/rejected": -3.0769712924957275, "loss": 0.3974, "rewards/accuracies": 1.0, "rewards/chosen": 1.3128162622451782, "rewards/margins": 0.7173921465873718, "rewards/rejected": 0.5954241156578064, "step": 6838 }, { "epoch": 3.69, "learning_rate": 1.5793253125523598e-09, "logits/chosen": -2.154010534286499, "logits/rejected": -2.144975423812866, "logps/chosen": -2.8597631454467773, "logps/rejected": -5.394142150878906, "loss": 0.4726, "rewards/accuracies": 1.0, "rewards/chosen": 1.415934681892395, "rewards/margins": 0.5040461421012878, "rewards/rejected": 0.9118885397911072, "step": 6839 }, { "epoch": 3.69, "learning_rate": 1.5738846690253882e-09, "logits/chosen": -2.1004745960235596, "logits/rejected": -2.0794906616210938, "logps/chosen": -7.2326459884643555, "logps/rejected": -1.813511610031128, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 1.7691322565078735, "rewards/margins": 0.944492518901825, "rewards/rejected": 0.8246397376060486, "step": 6840 }, { "epoch": 3.69, "learning_rate": 1.5684532631004776e-09, "logits/chosen": -1.9810845851898193, "logits/rejected": -2.2612602710723877, "logps/chosen": -0.12658251821994781, "logps/rejected": -0.11458922177553177, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.9530040621757507, "rewards/margins": 0.017339587211608887, "rewards/rejected": 0.9356644749641418, "step": 6841 }, { "epoch": 3.69, "learning_rate": 1.563031095813705e-09, "logits/chosen": -2.112999439239502, "logits/rejected": -2.0128636360168457, "logps/chosen": -4.312413215637207, "logps/rejected": -5.364898204803467, "loss": 0.3084, "rewards/accuracies": 1.0, "rewards/chosen": 1.7582828998565674, "rewards/margins": 1.018261432647705, "rewards/rejected": 0.7400214076042175, "step": 6842 }, { "epoch": 3.69, "learning_rate": 1.5576181681993928e-09, "logits/chosen": -2.030299425125122, "logits/rejected": -2.0101234912872314, "logps/chosen": -9.848747253417969, "logps/rejected": -11.274394989013672, "loss": 0.4806, "rewards/accuracies": 1.0, "rewards/chosen": 1.2103697061538696, "rewards/margins": 0.48285752534866333, "rewards/rejected": 0.7275121808052063, "step": 6843 }, { "epoch": 3.69, "learning_rate": 1.5522144812900873e-09, "logits/chosen": -2.044663667678833, "logits/rejected": -2.317302703857422, "logps/chosen": -0.5609244108200073, "logps/rejected": -0.6005982756614685, "loss": 0.6662, "rewards/accuracies": 1.0, "rewards/chosen": 1.0022518634796143, "rewards/margins": 0.05456411838531494, "rewards/rejected": 0.9476877450942993, "step": 6844 }, { "epoch": 3.69, "learning_rate": 1.546820036116575e-09, "logits/chosen": -2.038926839828491, "logits/rejected": -2.0426273345947266, "logps/chosen": -3.258167266845703, "logps/rejected": -3.4607560634613037, "loss": 0.4131, "rewards/accuracies": 1.0, "rewards/chosen": 1.3243588209152222, "rewards/margins": 0.6704151034355164, "rewards/rejected": 0.6539437174797058, "step": 6845 }, { "epoch": 3.69, "learning_rate": 1.5414348337078887e-09, "logits/chosen": -2.0331006050109863, "logits/rejected": -2.329716920852661, "logps/chosen": -1.5736724138259888, "logps/rejected": -0.30764904618263245, "loss": 0.778, "rewards/accuracies": 0.0, "rewards/chosen": 0.9277896881103516, "rewards/margins": -0.16310036182403564, "rewards/rejected": 1.0908900499343872, "step": 6846 }, { "epoch": 3.69, "learning_rate": 1.5360588750912895e-09, "logits/chosen": -2.218398094177246, "logits/rejected": -2.221374750137329, "logps/chosen": -2.2402188777923584, "logps/rejected": -1.1066821813583374, "loss": 0.5849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9881941080093384, "rewards/margins": 0.22968661785125732, "rewards/rejected": 0.758507490158081, "step": 6847 }, { "epoch": 3.69, "learning_rate": 1.530692161292274e-09, "logits/chosen": -2.0929458141326904, "logits/rejected": -2.29632306098938, "logps/chosen": -4.412969589233398, "logps/rejected": -0.46355849504470825, "loss": 0.7465, "rewards/accuracies": 0.0, "rewards/chosen": 0.7564611434936523, "rewards/margins": -0.1040792465209961, "rewards/rejected": 0.8605403900146484, "step": 6848 }, { "epoch": 3.69, "learning_rate": 1.5253346933345846e-09, "logits/chosen": -2.068584442138672, "logits/rejected": -2.2468678951263428, "logps/chosen": -3.0762908458709717, "logps/rejected": -3.3006036281585693, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.7715235948562622, "rewards/margins": 0.004896640777587891, "rewards/rejected": 0.7666269540786743, "step": 6849 }, { "epoch": 3.69, "learning_rate": 1.5199864722401755e-09, "logits/chosen": -2.075279951095581, "logits/rejected": -2.0724027156829834, "logps/chosen": -2.165386199951172, "logps/rejected": -6.475521087646484, "loss": 0.2533, "rewards/accuracies": 1.0, "rewards/chosen": 1.5372761487960815, "rewards/margins": 1.2439579963684082, "rewards/rejected": 0.2933181822299957, "step": 6850 }, { "epoch": 3.7, "learning_rate": 1.5146474990292812e-09, "logits/chosen": -2.073373317718506, "logits/rejected": -2.2833263874053955, "logps/chosen": -0.23351411521434784, "logps/rejected": -0.2679855525493622, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.9076699614524841, "rewards/margins": 0.0186617374420166, "rewards/rejected": 0.8890082240104675, "step": 6851 }, { "epoch": 3.7, "learning_rate": 1.509317774720331e-09, "logits/chosen": -2.0649046897888184, "logits/rejected": -2.0660104751586914, "logps/chosen": -4.976677894592285, "logps/rejected": -9.232019424438477, "loss": 0.2759, "rewards/accuracies": 1.0, "rewards/chosen": 1.309079885482788, "rewards/margins": 1.1465210914611816, "rewards/rejected": 0.16255883872509003, "step": 6852 }, { "epoch": 3.7, "learning_rate": 1.5039973003300121e-09, "logits/chosen": -2.0059938430786133, "logits/rejected": -2.278597354888916, "logps/chosen": -0.49717697501182556, "logps/rejected": -0.5588474273681641, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 1.1379848718643188, "rewards/margins": 0.008189916610717773, "rewards/rejected": 1.129794955253601, "step": 6853 }, { "epoch": 3.7, "learning_rate": 1.4986860768732402e-09, "logits/chosen": -2.0237133502960205, "logits/rejected": -2.2626595497131348, "logps/chosen": -1.2743723392486572, "logps/rejected": -1.2825255393981934, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 1.0091503858566284, "rewards/margins": -0.007837057113647461, "rewards/rejected": 1.0169874429702759, "step": 6854 }, { "epoch": 3.7, "learning_rate": 1.4933841053631547e-09, "logits/chosen": -2.133297920227051, "logits/rejected": -2.3106439113616943, "logps/chosen": -0.3118649125099182, "logps/rejected": -0.3115364611148834, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.988563060760498, "rewards/margins": 0.010990619659423828, "rewards/rejected": 0.9775724411010742, "step": 6855 }, { "epoch": 3.7, "learning_rate": 1.4880913868111521e-09, "logits/chosen": -2.0043227672576904, "logits/rejected": -2.321000814437866, "logps/chosen": -2.014167308807373, "logps/rejected": -9.571457862854004, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.9288238883018494, "rewards/margins": 0.007441043853759766, "rewards/rejected": 0.9213828444480896, "step": 6856 }, { "epoch": 3.7, "learning_rate": 1.4828079222268476e-09, "logits/chosen": -2.1012537479400635, "logits/rejected": -2.1027255058288574, "logps/chosen": -1.6026959419250488, "logps/rejected": -1.1565067768096924, "loss": 0.5192, "rewards/accuracies": 1.0, "rewards/chosen": 1.4009037017822266, "rewards/margins": 0.3845478296279907, "rewards/rejected": 1.0163558721542358, "step": 6857 }, { "epoch": 3.7, "learning_rate": 1.4775337126181008e-09, "logits/chosen": -2.2145204544067383, "logits/rejected": -2.213679313659668, "logps/chosen": -2.1158597469329834, "logps/rejected": -5.709885597229004, "loss": 0.4323, "rewards/accuracies": 1.0, "rewards/chosen": 1.0561530590057373, "rewards/margins": 0.6148166060447693, "rewards/rejected": 0.441336452960968, "step": 6858 }, { "epoch": 3.7, "learning_rate": 1.4722687589910022e-09, "logits/chosen": -2.055478096008301, "logits/rejected": -2.0552263259887695, "logps/chosen": -1.4139127731323242, "logps/rejected": -4.716014862060547, "loss": 0.5665, "rewards/accuracies": 1.0, "rewards/chosen": 1.1861876249313354, "rewards/margins": 0.2716705799102783, "rewards/rejected": 0.9145170450210571, "step": 6859 }, { "epoch": 3.7, "learning_rate": 1.467013062349881e-09, "logits/chosen": -2.023346424102783, "logits/rejected": -2.2932283878326416, "logps/chosen": -0.7307702898979187, "logps/rejected": -0.7980894446372986, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.8710941672325134, "rewards/margins": 0.003860771656036377, "rewards/rejected": 0.867233395576477, "step": 6860 }, { "epoch": 3.7, "learning_rate": 1.4617666236972803e-09, "logits/chosen": -2.2260921001434326, "logits/rejected": -2.2744204998016357, "logps/chosen": -0.8465691804885864, "logps/rejected": -0.9166786074638367, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9396333694458008, "rewards/margins": 0.022174179553985596, "rewards/rejected": 0.9174591898918152, "step": 6861 }, { "epoch": 3.7, "learning_rate": 1.4565294440340103e-09, "logits/chosen": -2.0240612030029297, "logits/rejected": -2.314371347427368, "logps/chosen": -0.598983108997345, "logps/rejected": -0.6509303450584412, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 1.054067850112915, "rewards/margins": 0.0036286115646362305, "rewards/rejected": 1.0504392385482788, "step": 6862 }, { "epoch": 3.7, "learning_rate": 1.4513015243590943e-09, "logits/chosen": -2.0870063304901123, "logits/rejected": -2.3716976642608643, "logps/chosen": -1.9938466548919678, "logps/rejected": -2.339461326599121, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9013811945915222, "rewards/margins": 0.015115022659301758, "rewards/rejected": 0.8862661719322205, "step": 6863 }, { "epoch": 3.7, "learning_rate": 1.4460828656697844e-09, "logits/chosen": -2.192323923110962, "logits/rejected": -2.0926198959350586, "logps/chosen": -17.582761764526367, "logps/rejected": -4.953031063079834, "loss": 0.098, "rewards/accuracies": 1.0, "rewards/chosen": 2.638576030731201, "rewards/margins": 2.2736403942108154, "rewards/rejected": 0.3649355471134186, "step": 6864 }, { "epoch": 3.7, "learning_rate": 1.4408734689615843e-09, "logits/chosen": -2.139482259750366, "logits/rejected": -2.359632730484009, "logps/chosen": -0.23224222660064697, "logps/rejected": -0.23415586352348328, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.8658876419067383, "rewards/margins": 0.029837429523468018, "rewards/rejected": 0.8360502123832703, "step": 6865 }, { "epoch": 3.7, "learning_rate": 1.4356733352282102e-09, "logits/chosen": -1.9864624738693237, "logits/rejected": -2.2862532138824463, "logps/chosen": -0.5870745182037354, "logps/rejected": -0.584023118019104, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9206782579421997, "rewards/margins": 0.02011948823928833, "rewards/rejected": 0.9005587697029114, "step": 6866 }, { "epoch": 3.7, "learning_rate": 1.4304824654616354e-09, "logits/chosen": -2.1167855262756348, "logits/rejected": -2.1330313682556152, "logps/chosen": -3.0192549228668213, "logps/rejected": -5.001317024230957, "loss": 0.2514, "rewards/accuracies": 1.0, "rewards/chosen": 1.9262348413467407, "rewards/margins": 1.2524635791778564, "rewards/rejected": 0.6737712025642395, "step": 6867 }, { "epoch": 3.7, "learning_rate": 1.42530086065204e-09, "logits/chosen": -2.1657121181488037, "logits/rejected": -2.15946626663208, "logps/chosen": -6.7609148025512695, "logps/rejected": -6.208563804626465, "loss": 0.3577, "rewards/accuracies": 1.0, "rewards/chosen": 1.1191798448562622, "rewards/margins": 0.843985378742218, "rewards/rejected": 0.2751944661140442, "step": 6868 }, { "epoch": 3.7, "learning_rate": 1.4201285217878555e-09, "logits/chosen": -1.9496217966079712, "logits/rejected": -1.948913335800171, "logps/chosen": -0.35019272565841675, "logps/rejected": -2.418405532836914, "loss": 0.6313, "rewards/accuracies": 1.0, "rewards/chosen": 0.8935920596122742, "rewards/margins": 0.12773370742797852, "rewards/rejected": 0.7658583521842957, "step": 6869 }, { "epoch": 3.71, "learning_rate": 1.4149654498557373e-09, "logits/chosen": -2.0775845050811768, "logits/rejected": -2.314760208129883, "logps/chosen": -0.2059003859758377, "logps/rejected": -0.25527459383010864, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8629889488220215, "rewards/margins": 0.011153578758239746, "rewards/rejected": 0.8518353700637817, "step": 6870 }, { "epoch": 3.71, "learning_rate": 1.4098116458405862e-09, "logits/chosen": -2.1199951171875, "logits/rejected": -2.316387414932251, "logps/chosen": -9.673986434936523, "logps/rejected": -5.611270427703857, "loss": 0.5765, "rewards/accuracies": 1.0, "rewards/chosen": 1.1206684112548828, "rewards/margins": 0.24863356351852417, "rewards/rejected": 0.8720348477363586, "step": 6871 }, { "epoch": 3.71, "learning_rate": 1.4046671107255048e-09, "logits/chosen": -2.176053285598755, "logits/rejected": -2.1679985523223877, "logps/chosen": -7.5340352058410645, "logps/rejected": -0.8283119201660156, "loss": 0.4606, "rewards/accuracies": 1.0, "rewards/chosen": 1.4972690343856812, "rewards/margins": 0.5361813306808472, "rewards/rejected": 0.961087703704834, "step": 6872 }, { "epoch": 3.71, "learning_rate": 1.399531845491858e-09, "logits/chosen": -2.2548227310180664, "logits/rejected": -2.219248056411743, "logps/chosen": -20.35159683227539, "logps/rejected": -23.177940368652344, "loss": 0.4856, "rewards/accuracies": 1.0, "rewards/chosen": 1.8799076080322266, "rewards/margins": 0.469889760017395, "rewards/rejected": 1.4100178480148315, "step": 6873 }, { "epoch": 3.71, "learning_rate": 1.3944058511192348e-09, "logits/chosen": -2.1783812046051025, "logits/rejected": -2.3664791584014893, "logps/chosen": -1.2313154935836792, "logps/rejected": -1.2213276624679565, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 1.2004145383834839, "rewards/margins": 0.0370326042175293, "rewards/rejected": 1.1633819341659546, "step": 6874 }, { "epoch": 3.71, "learning_rate": 1.3892891285854468e-09, "logits/chosen": -2.0162830352783203, "logits/rejected": -2.0154120922088623, "logps/chosen": -0.5302906632423401, "logps/rejected": -1.9784804582595825, "loss": 0.5711, "rewards/accuracies": 1.0, "rewards/chosen": 1.0534192323684692, "rewards/margins": 0.2610549330711365, "rewards/rejected": 0.7923642992973328, "step": 6875 }, { "epoch": 3.71, "learning_rate": 1.3841816788665472e-09, "logits/chosen": -2.1200380325317383, "logits/rejected": -2.1298933029174805, "logps/chosen": -2.7186014652252197, "logps/rejected": -6.202631950378418, "loss": 0.6745, "rewards/accuracies": 1.0, "rewards/chosen": 1.0220458507537842, "rewards/margins": 0.03767305612564087, "rewards/rejected": 0.9843727946281433, "step": 6876 }, { "epoch": 3.71, "learning_rate": 1.3790835029368175e-09, "logits/chosen": -2.102475166320801, "logits/rejected": -2.206738233566284, "logps/chosen": -0.7908041477203369, "logps/rejected": -26.19840431213379, "loss": 0.2609, "rewards/accuracies": 1.0, "rewards/chosen": 1.1939749717712402, "rewards/margins": 1.210491418838501, "rewards/rejected": -0.016516495496034622, "step": 6877 }, { "epoch": 3.71, "learning_rate": 1.3739946017687576e-09, "logits/chosen": -2.093116044998169, "logits/rejected": -2.248833656311035, "logps/chosen": -6.274057388305664, "logps/rejected": -4.440905570983887, "loss": 0.5838, "rewards/accuracies": 1.0, "rewards/chosen": 1.1302458047866821, "rewards/margins": 0.23208707571029663, "rewards/rejected": 0.8981587290763855, "step": 6878 }, { "epoch": 3.71, "learning_rate": 1.3689149763331187e-09, "logits/chosen": -2.1317129135131836, "logits/rejected": -2.282885789871216, "logps/chosen": -0.14085158705711365, "logps/rejected": -0.18618398904800415, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.8459741473197937, "rewards/margins": 0.022402644157409668, "rewards/rejected": 0.823571503162384, "step": 6879 }, { "epoch": 3.71, "learning_rate": 1.3638446275988701e-09, "logits/chosen": -2.078092336654663, "logits/rejected": -2.351926803588867, "logps/chosen": -0.8007450699806213, "logps/rejected": -0.9024149179458618, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.9944672584533691, "rewards/margins": -0.0008511543273925781, "rewards/rejected": 0.9953184127807617, "step": 6880 }, { "epoch": 3.71, "learning_rate": 1.3587835565332162e-09, "logits/chosen": -2.228332042694092, "logits/rejected": -2.1921193599700928, "logps/chosen": -12.363773345947266, "logps/rejected": -3.138942241668701, "loss": 0.4449, "rewards/accuracies": 1.0, "rewards/chosen": 1.3066291809082031, "rewards/margins": 0.5791333317756653, "rewards/rejected": 0.7274958491325378, "step": 6881 }, { "epoch": 3.71, "learning_rate": 1.35373176410159e-09, "logits/chosen": -2.1560494899749756, "logits/rejected": -2.155658483505249, "logps/chosen": -4.00375509262085, "logps/rejected": -2.485344886779785, "loss": 0.2705, "rewards/accuracies": 1.0, "rewards/chosen": 1.7044498920440674, "rewards/margins": 1.1692297458648682, "rewards/rejected": 0.5352201461791992, "step": 6882 }, { "epoch": 3.71, "learning_rate": 1.3486892512676485e-09, "logits/chosen": -2.0898184776306152, "logits/rejected": -2.082615375518799, "logps/chosen": -11.33491325378418, "logps/rejected": -5.929596900939941, "loss": 0.4381, "rewards/accuracies": 1.0, "rewards/chosen": 1.2797441482543945, "rewards/margins": 0.5982138514518738, "rewards/rejected": 0.6815302968025208, "step": 6883 }, { "epoch": 3.71, "learning_rate": 1.3436560189932945e-09, "logits/chosen": -2.0592763423919678, "logits/rejected": -2.3271896839141846, "logps/chosen": -0.44790658354759216, "logps/rejected": -0.5999669432640076, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.9569965600967407, "rewards/margins": 0.0004233717918395996, "rewards/rejected": 0.9565731883049011, "step": 6884 }, { "epoch": 3.71, "learning_rate": 1.3386320682386377e-09, "logits/chosen": -2.1627657413482666, "logits/rejected": -2.1663315296173096, "logps/chosen": -1.8231618404388428, "logps/rejected": -3.343369245529175, "loss": 0.4796, "rewards/accuracies": 1.0, "rewards/chosen": 1.0562347173690796, "rewards/margins": 0.4854263663291931, "rewards/rejected": 0.5708083510398865, "step": 6885 }, { "epoch": 3.71, "learning_rate": 1.3336173999620392e-09, "logits/chosen": -2.126067876815796, "logits/rejected": -2.301271677017212, "logps/chosen": -0.768672525882721, "logps/rejected": -0.6530389189720154, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.9056615829467773, "rewards/margins": 0.004596054553985596, "rewards/rejected": 0.9010655283927917, "step": 6886 }, { "epoch": 3.71, "learning_rate": 1.3286120151200785e-09, "logits/chosen": -2.0696933269500732, "logits/rejected": -2.062634229660034, "logps/chosen": -3.568699359893799, "logps/rejected": -1.2480833530426025, "loss": 0.2203, "rewards/accuracies": 1.0, "rewards/chosen": 2.27736759185791, "rewards/margins": 1.4007537364959717, "rewards/rejected": 0.8766137957572937, "step": 6887 }, { "epoch": 3.72, "learning_rate": 1.323615914667564e-09, "logits/chosen": -2.061350107192993, "logits/rejected": -2.066002130508423, "logps/chosen": -1.7988612651824951, "logps/rejected": -4.8100080490112305, "loss": 0.4332, "rewards/accuracies": 1.0, "rewards/chosen": 1.2019320726394653, "rewards/margins": 0.6120988726615906, "rewards/rejected": 0.5898331999778748, "step": 6888 }, { "epoch": 3.72, "learning_rate": 1.318629099557539e-09, "logits/chosen": -2.0277259349823, "logits/rejected": -2.300454616546631, "logps/chosen": -0.5823327302932739, "logps/rejected": -0.5536450743675232, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 1.0045409202575684, "rewards/margins": 0.01628398895263672, "rewards/rejected": 0.9882569313049316, "step": 6889 }, { "epoch": 3.72, "learning_rate": 1.3136515707412643e-09, "logits/chosen": -2.0907986164093018, "logits/rejected": -2.0915355682373047, "logps/chosen": -2.527477979660034, "logps/rejected": -2.5536677837371826, "loss": 0.3507, "rewards/accuracies": 1.0, "rewards/chosen": 1.7832386493682861, "rewards/margins": 0.867414116859436, "rewards/rejected": 0.9158245325088501, "step": 6890 }, { "epoch": 3.72, "learning_rate": 1.308683329168242e-09, "logits/chosen": -2.1143851280212402, "logits/rejected": -2.293947219848633, "logps/chosen": -0.48560646176338196, "logps/rejected": -5.197803974151611, "loss": 0.7077, "rewards/accuracies": 0.0, "rewards/chosen": 0.7591001391410828, "rewards/margins": -0.028961360454559326, "rewards/rejected": 0.7880614995956421, "step": 6891 }, { "epoch": 3.72, "learning_rate": 1.3037243757861972e-09, "logits/chosen": -1.987771987915039, "logits/rejected": -1.999699354171753, "logps/chosen": -3.0258781909942627, "logps/rejected": -4.819643974304199, "loss": 0.4103, "rewards/accuracies": 1.0, "rewards/chosen": 1.0850423574447632, "rewards/margins": 0.6788086891174316, "rewards/rejected": 0.40623369812965393, "step": 6892 }, { "epoch": 3.72, "learning_rate": 1.2987747115410786e-09, "logits/chosen": -2.1731228828430176, "logits/rejected": -2.2940924167633057, "logps/chosen": -0.3743111491203308, "logps/rejected": -0.38017839193344116, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 1.0013459920883179, "rewards/margins": -0.005820751190185547, "rewards/rejected": 1.0071667432785034, "step": 6893 }, { "epoch": 3.72, "learning_rate": 1.2938343373770754e-09, "logits/chosen": -2.086728811264038, "logits/rejected": -2.073997735977173, "logps/chosen": -0.2380187213420868, "logps/rejected": -7.6971282958984375, "loss": 0.4142, "rewards/accuracies": 1.0, "rewards/chosen": 0.9352334141731262, "rewards/margins": 0.6672887802124023, "rewards/rejected": 0.2679446339607239, "step": 6894 }, { "epoch": 3.72, "learning_rate": 1.2889032542365841e-09, "logits/chosen": -2.0589029788970947, "logits/rejected": -2.0537922382354736, "logps/chosen": -3.1294541358947754, "logps/rejected": -4.52932596206665, "loss": 0.3645, "rewards/accuracies": 1.0, "rewards/chosen": 1.4075539112091064, "rewards/margins": 0.8215695023536682, "rewards/rejected": 0.5859844088554382, "step": 6895 }, { "epoch": 3.72, "learning_rate": 1.283981463060252e-09, "logits/chosen": -2.148068428039551, "logits/rejected": -2.3850862979888916, "logps/chosen": -2.774491786956787, "logps/rejected": -2.5085737705230713, "loss": 0.701, "rewards/accuracies": 0.0, "rewards/chosen": 0.9918712973594666, "rewards/margins": -0.01571863889694214, "rewards/rejected": 1.0075899362564087, "step": 6896 }, { "epoch": 3.72, "learning_rate": 1.2790689647869446e-09, "logits/chosen": -1.9434391260147095, "logits/rejected": -1.951209545135498, "logps/chosen": -3.1569576263427734, "logps/rejected": -5.062431335449219, "loss": 0.4923, "rewards/accuracies": 1.0, "rewards/chosen": 0.9960344433784485, "rewards/margins": 0.4523288607597351, "rewards/rejected": 0.5437055826187134, "step": 6897 }, { "epoch": 3.72, "learning_rate": 1.274165760353746e-09, "logits/chosen": -2.141404867172241, "logits/rejected": -2.277484893798828, "logps/chosen": -0.7621688842773438, "logps/rejected": -0.7911019325256348, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 1.0868782997131348, "rewards/margins": 0.016886234283447266, "rewards/rejected": 1.0699920654296875, "step": 6898 }, { "epoch": 3.72, "learning_rate": 1.26927185069598e-09, "logits/chosen": -2.0075831413269043, "logits/rejected": -2.2569055557250977, "logps/chosen": -3.1939845085144043, "logps/rejected": -0.3435887098312378, "loss": 0.7414, "rewards/accuracies": 0.0, "rewards/chosen": 0.8852642178535461, "rewards/margins": -0.0942835807800293, "rewards/rejected": 0.9795477986335754, "step": 6899 }, { "epoch": 3.72, "learning_rate": 1.2643872367471887e-09, "logits/chosen": -2.159952163696289, "logits/rejected": -2.2080042362213135, "logps/chosen": -2.9073028564453125, "logps/rejected": -12.592042922973633, "loss": 0.2511, "rewards/accuracies": 1.0, "rewards/chosen": 1.7335224151611328, "rewards/margins": 1.253706932067871, "rewards/rejected": 0.4798154830932617, "step": 6900 }, { "epoch": 3.72, "learning_rate": 1.2595119194391546e-09, "logits/chosen": -2.0100889205932617, "logits/rejected": -2.010370969772339, "logps/chosen": -1.7106143236160278, "logps/rejected": -0.955890417098999, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 1.0415328741073608, "rewards/margins": -0.010398268699645996, "rewards/rejected": 1.0519311428070068, "step": 6901 }, { "epoch": 3.72, "learning_rate": 1.2546458997018617e-09, "logits/chosen": -2.1362314224243164, "logits/rejected": -2.1378934383392334, "logps/chosen": -2.9639341831207275, "logps/rejected": -7.527310848236084, "loss": 0.4825, "rewards/accuracies": 1.0, "rewards/chosen": 1.3242990970611572, "rewards/margins": 0.4777332544326782, "rewards/rejected": 0.846565842628479, "step": 6902 }, { "epoch": 3.72, "learning_rate": 1.2497891784635395e-09, "logits/chosen": -2.0941505432128906, "logits/rejected": -2.0987985134124756, "logps/chosen": -0.3195503056049347, "logps/rejected": -7.142451286315918, "loss": 0.4714, "rewards/accuracies": 1.0, "rewards/chosen": 0.9495053291320801, "rewards/margins": 0.5072345733642578, "rewards/rejected": 0.44227075576782227, "step": 6903 }, { "epoch": 3.72, "learning_rate": 1.244941756650647e-09, "logits/chosen": -2.157252550125122, "logits/rejected": -2.1521103382110596, "logps/chosen": -3.4711506366729736, "logps/rejected": -2.3817412853240967, "loss": 0.5002, "rewards/accuracies": 1.0, "rewards/chosen": 1.142301321029663, "rewards/margins": 0.4322301149368286, "rewards/rejected": 0.7100712060928345, "step": 6904 }, { "epoch": 3.72, "learning_rate": 1.2401036351878559e-09, "logits/chosen": -2.0450832843780518, "logits/rejected": -2.04764461517334, "logps/chosen": -2.5033297538757324, "logps/rejected": -0.6251569390296936, "loss": 0.6304, "rewards/accuracies": 1.0, "rewards/chosen": 1.1072005033493042, "rewards/margins": 0.1297937035560608, "rewards/rejected": 0.9774067997932434, "step": 6905 }, { "epoch": 3.72, "learning_rate": 1.2352748149980663e-09, "logits/chosen": -2.078791856765747, "logits/rejected": -2.07431697845459, "logps/chosen": -9.363408088684082, "logps/rejected": -5.860191822052002, "loss": 0.2345, "rewards/accuracies": 1.0, "rewards/chosen": 1.9058071374893188, "rewards/margins": 1.330932378768921, "rewards/rejected": 0.5748748183250427, "step": 6906 }, { "epoch": 3.73, "learning_rate": 1.230455297002414e-09, "logits/chosen": -2.0976526737213135, "logits/rejected": -2.302380084991455, "logps/chosen": -1.151222825050354, "logps/rejected": -1.0649391412734985, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 1.0443835258483887, "rewards/margins": 0.02303469181060791, "rewards/rejected": 1.0213488340377808, "step": 6907 }, { "epoch": 3.73, "learning_rate": 1.2256450821202579e-09, "logits/chosen": -2.1349196434020996, "logits/rejected": -2.133258819580078, "logps/chosen": -1.1595008373260498, "logps/rejected": -3.279829502105713, "loss": 0.5319, "rewards/accuracies": 1.0, "rewards/chosen": 0.915882408618927, "rewards/margins": 0.3535730838775635, "rewards/rejected": 0.5623093247413635, "step": 6908 }, { "epoch": 3.73, "learning_rate": 1.2208441712691697e-09, "logits/chosen": -2.172023296356201, "logits/rejected": -2.2091128826141357, "logps/chosen": -0.5875562429428101, "logps/rejected": -7.264588356018066, "loss": 0.7147, "rewards/accuracies": 0.0, "rewards/chosen": 1.0166305303573608, "rewards/margins": -0.042710065841674805, "rewards/rejected": 1.0593405961990356, "step": 6909 }, { "epoch": 3.73, "learning_rate": 1.216052565364961e-09, "logits/chosen": -2.0296809673309326, "logits/rejected": -2.3389649391174316, "logps/chosen": -0.85016268491745, "logps/rejected": -0.9275772571563721, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.877846896648407, "rewards/margins": 0.005613088607788086, "rewards/rejected": 0.8722338080406189, "step": 6910 }, { "epoch": 3.73, "learning_rate": 1.2112702653216566e-09, "logits/chosen": -2.193676233291626, "logits/rejected": -2.1280791759490967, "logps/chosen": -16.231666564941406, "logps/rejected": -9.481114387512207, "loss": 0.181, "rewards/accuracies": 1.0, "rewards/chosen": 2.0746116638183594, "rewards/margins": 1.6174960136413574, "rewards/rejected": 0.45711565017700195, "step": 6911 }, { "epoch": 3.73, "learning_rate": 1.206497272051521e-09, "logits/chosen": -2.1434431076049805, "logits/rejected": -2.323409080505371, "logps/chosen": -0.19922059774398804, "logps/rejected": -0.21573741734027863, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.8173381686210632, "rewards/margins": -0.009517669677734375, "rewards/rejected": 0.8268558382987976, "step": 6912 }, { "epoch": 3.73, "learning_rate": 1.2017335864650203e-09, "logits/chosen": -2.072042226791382, "logits/rejected": -2.3190109729766846, "logps/chosen": -0.4448263347148895, "logps/rejected": -0.44837427139282227, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9342714548110962, "rewards/margins": 0.023258209228515625, "rewards/rejected": 0.9110132455825806, "step": 6913 }, { "epoch": 3.73, "learning_rate": 1.196979209470872e-09, "logits/chosen": -2.147770404815674, "logits/rejected": -2.2908427715301514, "logps/chosen": -2.521181106567383, "logps/rejected": -2.5812549591064453, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 1.0595790147781372, "rewards/margins": 0.014297008514404297, "rewards/rejected": 1.045282006263733, "step": 6914 }, { "epoch": 3.73, "learning_rate": 1.1922341419760006e-09, "logits/chosen": -2.0644891262054443, "logits/rejected": -2.065624713897705, "logps/chosen": -1.0333563089370728, "logps/rejected": -3.508033514022827, "loss": 0.4559, "rewards/accuracies": 1.0, "rewards/chosen": 1.0891538858413696, "rewards/margins": 0.5489749908447266, "rewards/rejected": 0.5401788949966431, "step": 6915 }, { "epoch": 3.73, "learning_rate": 1.1874983848855602e-09, "logits/chosen": -2.0635342597961426, "logits/rejected": -2.228440523147583, "logps/chosen": -0.8508729934692383, "logps/rejected": -2.0316762924194336, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 1.0583547353744507, "rewards/margins": 0.008185863494873047, "rewards/rejected": 1.0501688718795776, "step": 6916 }, { "epoch": 3.73, "learning_rate": 1.1827719391029222e-09, "logits/chosen": -2.1177384853363037, "logits/rejected": -2.341963768005371, "logps/chosen": -0.9930245876312256, "logps/rejected": -0.8390834927558899, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": 1.1054755449295044, "rewards/margins": -0.012373924255371094, "rewards/rejected": 1.1178494691848755, "step": 6917 }, { "epoch": 3.73, "learning_rate": 1.1780548055296935e-09, "logits/chosen": -2.08377742767334, "logits/rejected": -2.0859572887420654, "logps/chosen": -0.28023019433021545, "logps/rejected": -5.691891670227051, "loss": 0.4177, "rewards/accuracies": 1.0, "rewards/chosen": 1.076346755027771, "rewards/margins": 0.6568806171417236, "rewards/rejected": 0.419466108083725, "step": 6918 }, { "epoch": 3.73, "learning_rate": 1.173346985065693e-09, "logits/chosen": -2.0243895053863525, "logits/rejected": -2.2876060009002686, "logps/chosen": -0.9894979596138, "logps/rejected": -0.9907900094985962, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.9414955973625183, "rewards/margins": -0.003348827362060547, "rewards/rejected": 0.9448444247245789, "step": 6919 }, { "epoch": 3.73, "learning_rate": 1.1686484786089746e-09, "logits/chosen": -2.0710270404815674, "logits/rejected": -2.0695369243621826, "logps/chosen": -0.5795634984970093, "logps/rejected": -2.1882693767547607, "loss": 0.5771, "rewards/accuracies": 1.0, "rewards/chosen": 1.0328929424285889, "rewards/margins": 0.24729782342910767, "rewards/rejected": 0.7855951189994812, "step": 6920 }, { "epoch": 3.73, "learning_rate": 1.1639592870558046e-09, "logits/chosen": -2.112757444381714, "logits/rejected": -2.306922197341919, "logps/chosen": -8.360932350158691, "logps/rejected": -0.7935009598731995, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.9374427199363708, "rewards/margins": -0.007461130619049072, "rewards/rejected": 0.9449038505554199, "step": 6921 }, { "epoch": 3.73, "learning_rate": 1.1592794113006842e-09, "logits/chosen": -2.1127946376800537, "logits/rejected": -2.1207802295684814, "logps/chosen": -1.2146803140640259, "logps/rejected": -3.443580389022827, "loss": 0.4141, "rewards/accuracies": 1.0, "rewards/chosen": 1.1563166379928589, "rewards/margins": 0.6675220727920532, "rewards/rejected": 0.4887945353984833, "step": 6922 }, { "epoch": 3.73, "learning_rate": 1.1546088522363217e-09, "logits/chosen": -2.0164568424224854, "logits/rejected": -2.2794125080108643, "logps/chosen": -1.7963719367980957, "logps/rejected": -1.8582947254180908, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 1.2921589612960815, "rewards/margins": 0.027724146842956543, "rewards/rejected": 1.264434814453125, "step": 6923 }, { "epoch": 3.73, "learning_rate": 1.1499476107536654e-09, "logits/chosen": -1.9905937910079956, "logits/rejected": -1.9855321645736694, "logps/chosen": -5.96193790435791, "logps/rejected": -3.145159959793091, "loss": 0.2979, "rewards/accuracies": 1.0, "rewards/chosen": 1.615094780921936, "rewards/margins": 1.0582923889160156, "rewards/rejected": 0.5568023920059204, "step": 6924 }, { "epoch": 3.74, "learning_rate": 1.1452956877418763e-09, "logits/chosen": -2.0310497283935547, "logits/rejected": -2.2888829708099365, "logps/chosen": -5.561005592346191, "logps/rejected": -1.7263463735580444, "loss": 0.7642, "rewards/accuracies": 0.0, "rewards/chosen": 0.8896968960762024, "rewards/margins": -0.13744038343429565, "rewards/rejected": 1.027137279510498, "step": 6925 }, { "epoch": 3.74, "learning_rate": 1.1406530840883332e-09, "logits/chosen": -2.114562749862671, "logits/rejected": -2.098829984664917, "logps/chosen": -17.01669692993164, "logps/rejected": -4.839928150177002, "loss": 0.2582, "rewards/accuracies": 1.0, "rewards/chosen": 1.5439674854278564, "rewards/margins": 1.2223546504974365, "rewards/rejected": 0.32161280512809753, "step": 6926 }, { "epoch": 3.74, "learning_rate": 1.1360198006786447e-09, "logits/chosen": -2.167522668838501, "logits/rejected": -2.2693917751312256, "logps/chosen": -4.296781539916992, "logps/rejected": -4.468303203582764, "loss": 0.6406, "rewards/accuracies": 1.0, "rewards/chosen": 0.7451754808425903, "rewards/margins": 0.10799449682235718, "rewards/rejected": 0.6371809840202332, "step": 6927 }, { "epoch": 3.74, "learning_rate": 1.1313958383966426e-09, "logits/chosen": -2.2226529121398926, "logits/rejected": -2.2111775875091553, "logps/chosen": -6.4706549644470215, "logps/rejected": -3.9314706325531006, "loss": 0.4214, "rewards/accuracies": 1.0, "rewards/chosen": 1.364172339439392, "rewards/margins": 0.645950198173523, "rewards/rejected": 0.7182221412658691, "step": 6928 }, { "epoch": 3.74, "learning_rate": 1.126781198124388e-09, "logits/chosen": -2.099968433380127, "logits/rejected": -2.1125171184539795, "logps/chosen": -7.058412551879883, "logps/rejected": -8.12894344329834, "loss": 0.1778, "rewards/accuracies": 1.0, "rewards/chosen": 2.188809633255005, "rewards/margins": 1.6371045112609863, "rewards/rejected": 0.5517050623893738, "step": 6929 }, { "epoch": 3.74, "learning_rate": 1.1221758807421433e-09, "logits/chosen": -2.09706449508667, "logits/rejected": -2.239067792892456, "logps/chosen": -3.732041835784912, "logps/rejected": -0.25786158442497253, "loss": 0.6425, "rewards/accuracies": 1.0, "rewards/chosen": 0.8623750805854797, "rewards/margins": 0.10398650169372559, "rewards/rejected": 0.7583885788917542, "step": 6930 }, { "epoch": 3.74, "learning_rate": 1.1175798871284004e-09, "logits/chosen": -2.0908939838409424, "logits/rejected": -2.281428813934326, "logps/chosen": -0.2994260787963867, "logps/rejected": -0.3266541063785553, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.9557232856750488, "rewards/margins": 0.01049184799194336, "rewards/rejected": 0.9452314376831055, "step": 6931 }, { "epoch": 3.74, "learning_rate": 1.1129932181598856e-09, "logits/chosen": -2.1047091484069824, "logits/rejected": -2.1308274269104004, "logps/chosen": -16.766239166259766, "logps/rejected": -14.483861923217773, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": 2.5219883918762207, "rewards/margins": 2.161848306655884, "rewards/rejected": 0.3601400554180145, "step": 6932 }, { "epoch": 3.74, "learning_rate": 1.1084158747115269e-09, "logits/chosen": -2.142693519592285, "logits/rejected": -2.1409645080566406, "logps/chosen": -0.3038186728954315, "logps/rejected": -3.3563671112060547, "loss": 0.5852, "rewards/accuracies": 1.0, "rewards/chosen": 0.952183187007904, "rewards/margins": 0.22886919975280762, "rewards/rejected": 0.7233139872550964, "step": 6933 }, { "epoch": 3.74, "learning_rate": 1.103847857656487e-09, "logits/chosen": -2.141404390335083, "logits/rejected": -2.1734061241149902, "logps/chosen": -7.711452960968018, "logps/rejected": -16.720905303955078, "loss": 0.3687, "rewards/accuracies": 1.0, "rewards/chosen": 1.3438156843185425, "rewards/margins": 0.8077555298805237, "rewards/rejected": 0.5360601544380188, "step": 6934 }, { "epoch": 3.74, "learning_rate": 1.0992891678661464e-09, "logits/chosen": -2.16628098487854, "logits/rejected": -2.148092746734619, "logps/chosen": -11.291967391967773, "logps/rejected": -9.638832092285156, "loss": 0.2167, "rewards/accuracies": 1.0, "rewards/chosen": 1.4470280408859253, "rewards/margins": 1.4186912775039673, "rewards/rejected": 0.028336716815829277, "step": 6935 }, { "epoch": 3.74, "learning_rate": 1.0947398062101044e-09, "logits/chosen": -1.9762532711029053, "logits/rejected": -1.9761813879013062, "logps/chosen": -6.117918968200684, "logps/rejected": -3.8439886569976807, "loss": 0.3094, "rewards/accuracies": 1.0, "rewards/chosen": 1.5658423900604248, "rewards/margins": 1.014465093612671, "rewards/rejected": 0.5513772368431091, "step": 6936 }, { "epoch": 3.74, "learning_rate": 1.0901997735561775e-09, "logits/chosen": -2.1431074142456055, "logits/rejected": -2.300248622894287, "logps/chosen": -0.4349592626094818, "logps/rejected": -0.47235602140426636, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.9534344673156738, "rewards/margins": 0.004032135009765625, "rewards/rejected": 0.9494023323059082, "step": 6937 }, { "epoch": 3.74, "learning_rate": 1.0856690707704174e-09, "logits/chosen": -2.048586130142212, "logits/rejected": -2.039646625518799, "logps/chosen": -9.794878959655762, "logps/rejected": -5.286057949066162, "loss": 0.1787, "rewards/accuracies": 1.0, "rewards/chosen": 2.343088150024414, "rewards/margins": 1.6314644813537598, "rewards/rejected": 0.7116236090660095, "step": 6938 }, { "epoch": 3.74, "learning_rate": 1.0811476987170776e-09, "logits/chosen": -1.9813690185546875, "logits/rejected": -2.2912747859954834, "logps/chosen": -0.6193631291389465, "logps/rejected": -0.6802572011947632, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.7832152247428894, "rewards/margins": 0.014636337757110596, "rewards/rejected": 0.7685788869857788, "step": 6939 }, { "epoch": 3.74, "learning_rate": 1.0766356582586345e-09, "logits/chosen": -2.164304256439209, "logits/rejected": -2.0945048332214355, "logps/chosen": -26.099456787109375, "logps/rejected": -8.066737174987793, "loss": 0.1721, "rewards/accuracies": 1.0, "rewards/chosen": 2.5648250579833984, "rewards/margins": 1.6726856231689453, "rewards/rejected": 0.8921393752098083, "step": 6940 }, { "epoch": 3.74, "learning_rate": 1.0721329502557996e-09, "logits/chosen": -2.1405882835388184, "logits/rejected": -2.1410233974456787, "logps/chosen": -0.25107860565185547, "logps/rejected": -5.4388556480407715, "loss": 0.4558, "rewards/accuracies": 1.0, "rewards/chosen": 0.9856974482536316, "rewards/margins": 0.54906165599823, "rewards/rejected": 0.436635822057724, "step": 6941 }, { "epoch": 3.74, "learning_rate": 1.067639575567497e-09, "logits/chosen": -2.110450506210327, "logits/rejected": -2.233257293701172, "logps/chosen": -0.3081858456134796, "logps/rejected": -0.3219747245311737, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 0.9635782241821289, "rewards/margins": 0.039669692516326904, "rewards/rejected": 0.923908531665802, "step": 6942 }, { "epoch": 3.74, "learning_rate": 1.0631555350508581e-09, "logits/chosen": -2.07419753074646, "logits/rejected": -2.0738959312438965, "logps/chosen": -1.0981882810592651, "logps/rejected": -0.9003519415855408, "loss": 0.6396, "rewards/accuracies": 1.0, "rewards/chosen": 0.9772853851318359, "rewards/margins": 0.11009043455123901, "rewards/rejected": 0.8671949505805969, "step": 6943 }, { "epoch": 3.75, "learning_rate": 1.0586808295612538e-09, "logits/chosen": -1.9975956678390503, "logits/rejected": -2.2855143547058105, "logps/chosen": -0.4698425233364105, "logps/rejected": -0.3903738558292389, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 1.059924840927124, "rewards/margins": 0.018759608268737793, "rewards/rejected": 1.0411652326583862, "step": 6944 }, { "epoch": 3.75, "learning_rate": 1.0542154599522511e-09, "logits/chosen": -2.1437156200408936, "logits/rejected": -2.1424319744110107, "logps/chosen": -8.53410816192627, "logps/rejected": -2.1013972759246826, "loss": 0.4705, "rewards/accuracies": 1.0, "rewards/chosen": 1.3838218450546265, "rewards/margins": 0.5094347596168518, "rewards/rejected": 0.8743870854377747, "step": 6945 }, { "epoch": 3.75, "learning_rate": 1.0497594270756527e-09, "logits/chosen": -2.227201461791992, "logits/rejected": -2.3190951347351074, "logps/chosen": -0.43961143493652344, "logps/rejected": -0.3989178538322449, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 1.0530842542648315, "rewards/margins": 0.014184713363647461, "rewards/rejected": 1.038899540901184, "step": 6946 }, { "epoch": 3.75, "learning_rate": 1.0453127317814891e-09, "logits/chosen": -1.9804209470748901, "logits/rejected": -1.9326292276382446, "logps/chosen": -11.897558212280273, "logps/rejected": -11.34060001373291, "loss": 0.2182, "rewards/accuracies": 1.0, "rewards/chosen": 1.5238010883331299, "rewards/margins": 1.4110867977142334, "rewards/rejected": 0.11271429061889648, "step": 6947 }, { "epoch": 3.75, "learning_rate": 1.0408753749179821e-09, "logits/chosen": -2.24573016166687, "logits/rejected": -2.387908935546875, "logps/chosen": -0.16027411818504333, "logps/rejected": -0.14380067586898804, "loss": 0.6666, "rewards/accuracies": 1.0, "rewards/chosen": 0.8125330805778503, "rewards/margins": 0.05381512641906738, "rewards/rejected": 0.758717954158783, "step": 6948 }, { "epoch": 3.75, "learning_rate": 1.036447357331588e-09, "logits/chosen": -2.0987513065338135, "logits/rejected": -2.297358274459839, "logps/chosen": -0.8729150891304016, "logps/rejected": -0.7747243046760559, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.6103103756904602, "rewards/margins": 0.014224112033843994, "rewards/rejected": 0.5960862636566162, "step": 6949 }, { "epoch": 3.75, "learning_rate": 1.0320286798669864e-09, "logits/chosen": -2.1449859142303467, "logits/rejected": -2.2854135036468506, "logps/chosen": -0.6196244955062866, "logps/rejected": -0.678248405456543, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 1.025443434715271, "rewards/margins": -0.0034896135330200195, "rewards/rejected": 1.028933048248291, "step": 6950 }, { "epoch": 3.75, "learning_rate": 1.0276193433670587e-09, "logits/chosen": -2.1363492012023926, "logits/rejected": -2.309816598892212, "logps/chosen": -1.5218799114227295, "logps/rejected": -1.4211755990982056, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 1.106522798538208, "rewards/margins": 0.010329008102416992, "rewards/rejected": 1.096193790435791, "step": 6951 }, { "epoch": 3.75, "learning_rate": 1.0232193486729268e-09, "logits/chosen": -1.9493539333343506, "logits/rejected": -1.9574425220489502, "logps/chosen": -1.058029055595398, "logps/rejected": -3.6563870906829834, "loss": 0.4704, "rewards/accuracies": 1.0, "rewards/chosen": 1.0632470846176147, "rewards/margins": 0.5098809599876404, "rewards/rejected": 0.5533661246299744, "step": 6952 }, { "epoch": 3.75, "learning_rate": 1.018828696623919e-09, "logits/chosen": -2.2641003131866455, "logits/rejected": -2.3344669342041016, "logps/chosen": -0.8859930038452148, "logps/rejected": -0.9338095188140869, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 1.0665472745895386, "rewards/margins": -0.005561709403991699, "rewards/rejected": 1.0721089839935303, "step": 6953 }, { "epoch": 3.75, "learning_rate": 1.0144473880575765e-09, "logits/chosen": -2.143033027648926, "logits/rejected": -2.3459601402282715, "logps/chosen": -2.6948976516723633, "logps/rejected": -2.749098062515259, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.826479434967041, "rewards/margins": 0.019905269145965576, "rewards/rejected": 0.8065741658210754, "step": 6954 }, { "epoch": 3.75, "learning_rate": 1.0100754238096586e-09, "logits/chosen": -1.9807277917861938, "logits/rejected": -2.315877914428711, "logps/chosen": -0.09297355264425278, "logps/rejected": -0.10991586744785309, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.9219269752502441, "rewards/margins": 0.019731879234313965, "rewards/rejected": 0.9021950960159302, "step": 6955 }, { "epoch": 3.75, "learning_rate": 1.0057128047141595e-09, "logits/chosen": -2.2575607299804688, "logits/rejected": -2.0620415210723877, "logps/chosen": -40.11151885986328, "logps/rejected": -4.052960395812988, "loss": 0.072, "rewards/accuracies": 1.0, "rewards/chosen": 3.0934479236602783, "rewards/margins": 2.5954034328460693, "rewards/rejected": 0.49804458022117615, "step": 6956 }, { "epoch": 3.75, "learning_rate": 1.001359531603263e-09, "logits/chosen": -2.0078341960906982, "logits/rejected": -2.0087437629699707, "logps/chosen": -0.16250619292259216, "logps/rejected": -9.563514709472656, "loss": 0.3189, "rewards/accuracies": 1.0, "rewards/chosen": 1.0658422708511353, "rewards/margins": 0.9790453910827637, "rewards/rejected": 0.08679685741662979, "step": 6957 }, { "epoch": 3.75, "learning_rate": 9.97015605307394e-10, "logits/chosen": -1.9942102432250977, "logits/rejected": -2.2628512382507324, "logps/chosen": -1.3602943420410156, "logps/rejected": -1.4208356142044067, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.9399017691612244, "rewards/margins": 0.013084828853607178, "rewards/rejected": 0.9268169403076172, "step": 6958 }, { "epoch": 3.75, "learning_rate": 9.926810266551843e-10, "logits/chosen": -2.05879807472229, "logits/rejected": -2.277852773666382, "logps/chosen": -0.34414321184158325, "logps/rejected": -0.36284950375556946, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.8444514274597168, "rewards/margins": 0.015483200550079346, "rewards/rejected": 0.8289682269096375, "step": 6959 }, { "epoch": 3.75, "learning_rate": 9.883557964734778e-10, "logits/chosen": -2.068039894104004, "logits/rejected": -2.060385227203369, "logps/chosen": -7.589881896972656, "logps/rejected": -4.292112350463867, "loss": 0.3012, "rewards/accuracies": 1.0, "rewards/chosen": 1.605485200881958, "rewards/margins": 1.045706033706665, "rewards/rejected": 0.559779167175293, "step": 6960 }, { "epoch": 3.75, "learning_rate": 9.84039915587348e-10, "logits/chosen": -2.00756573677063, "logits/rejected": -2.2474334239959717, "logps/chosen": -0.1156926080584526, "logps/rejected": -0.12043110281229019, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9121124148368835, "rewards/margins": 0.010378241539001465, "rewards/rejected": 0.9017341732978821, "step": 6961 }, { "epoch": 3.76, "learning_rate": 9.797333848200696e-10, "logits/chosen": -2.0571696758270264, "logits/rejected": -2.0582616329193115, "logps/chosen": -3.6495189666748047, "logps/rejected": -4.226757526397705, "loss": 0.5025, "rewards/accuracies": 1.0, "rewards/chosen": 1.1247421503067017, "rewards/margins": 0.4265046715736389, "rewards/rejected": 0.6982374787330627, "step": 6962 }, { "epoch": 3.76, "learning_rate": 9.75436204993152e-10, "logits/chosen": -1.9842911958694458, "logits/rejected": -1.98520827293396, "logps/chosen": -2.9674038887023926, "logps/rejected": -0.5712369680404663, "loss": 0.4606, "rewards/accuracies": 1.0, "rewards/chosen": 1.4542423486709595, "rewards/margins": 0.5360447764396667, "rewards/rejected": 0.9181975722312927, "step": 6963 }, { "epoch": 3.76, "learning_rate": 9.711483769263008e-10, "logits/chosen": -1.9421221017837524, "logits/rejected": -2.2919018268585205, "logps/chosen": -0.1052795946598053, "logps/rejected": -0.11497087776660919, "loss": 0.6695, "rewards/accuracies": 1.0, "rewards/chosen": 0.8928093910217285, "rewards/margins": 0.04779183864593506, "rewards/rejected": 0.8450175523757935, "step": 6964 }, { "epoch": 3.76, "learning_rate": 9.66869901437456e-10, "logits/chosen": -2.047100782394409, "logits/rejected": -2.3257248401641846, "logps/chosen": -0.3106590211391449, "logps/rejected": -0.26613980531692505, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.9654423594474792, "rewards/margins": 0.027366161346435547, "rewards/rejected": 0.9380761981010437, "step": 6965 }, { "epoch": 3.76, "learning_rate": 9.626007793427592e-10, "logits/chosen": -2.1775741577148438, "logits/rejected": -2.1737349033355713, "logps/chosen": -2.940463066101074, "logps/rejected": -4.05683708190918, "loss": 0.3215, "rewards/accuracies": 1.0, "rewards/chosen": 1.600930094718933, "rewards/margins": 0.9697713255882263, "rewards/rejected": 0.6311587691307068, "step": 6966 }, { "epoch": 3.76, "learning_rate": 9.58341011456576e-10, "logits/chosen": -2.13555908203125, "logits/rejected": -2.3017048835754395, "logps/chosen": -0.13614436984062195, "logps/rejected": -0.15355846285820007, "loss": 0.7007, "rewards/accuracies": 0.0, "rewards/chosen": 0.9310650825500488, "rewards/margins": -0.014983177185058594, "rewards/rejected": 0.9460482597351074, "step": 6967 }, { "epoch": 3.76, "learning_rate": 9.540905985914894e-10, "logits/chosen": -2.1851367950439453, "logits/rejected": -2.107517719268799, "logps/chosen": -26.74169921875, "logps/rejected": -3.227893829345703, "loss": 0.2628, "rewards/accuracies": 1.0, "rewards/chosen": 1.8709449768066406, "rewards/margins": 1.2022193670272827, "rewards/rejected": 0.6687256097793579, "step": 6968 }, { "epoch": 3.76, "learning_rate": 9.498495415582842e-10, "logits/chosen": -2.124232053756714, "logits/rejected": -2.131988286972046, "logps/chosen": -4.5484724044799805, "logps/rejected": -2.048614025115967, "loss": 0.1965, "rewards/accuracies": 1.0, "rewards/chosen": 2.1829891204833984, "rewards/margins": 1.527133584022522, "rewards/rejected": 0.6558555364608765, "step": 6969 }, { "epoch": 3.76, "learning_rate": 9.456178411659743e-10, "logits/chosen": -2.0776782035827637, "logits/rejected": -2.270775079727173, "logps/chosen": -0.3512870669364929, "logps/rejected": -0.3158440589904785, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.827614426612854, "rewards/margins": 0.009731709957122803, "rewards/rejected": 0.8178827166557312, "step": 6970 }, { "epoch": 3.76, "learning_rate": 9.41395498221792e-10, "logits/chosen": -2.1607885360717773, "logits/rejected": -2.2428581714630127, "logps/chosen": -6.534567356109619, "logps/rejected": -4.991089820861816, "loss": 0.7254, "rewards/accuracies": 0.0, "rewards/chosen": 0.7870034575462341, "rewards/margins": -0.06358563899993896, "rewards/rejected": 0.8505890965461731, "step": 6971 }, { "epoch": 3.76, "learning_rate": 9.37182513531165e-10, "logits/chosen": -2.105963945388794, "logits/rejected": -2.251765012741089, "logps/chosen": -0.45746010541915894, "logps/rejected": -0.6428494453430176, "loss": 0.7116, "rewards/accuracies": 0.0, "rewards/chosen": 1.117928147315979, "rewards/margins": -0.036562561988830566, "rewards/rejected": 1.1544907093048096, "step": 6972 }, { "epoch": 3.76, "learning_rate": 9.329788878977507e-10, "logits/chosen": -2.1152026653289795, "logits/rejected": -2.1349730491638184, "logps/chosen": -1.6126148700714111, "logps/rejected": -4.082728385925293, "loss": 0.5116, "rewards/accuracies": 1.0, "rewards/chosen": 1.0986427068710327, "rewards/margins": 0.40353256464004517, "rewards/rejected": 0.6951101422309875, "step": 6973 }, { "epoch": 3.76, "learning_rate": 9.287846221234297e-10, "logits/chosen": -2.126405954360962, "logits/rejected": -2.124781847000122, "logps/chosen": -1.0196365118026733, "logps/rejected": -3.626819133758545, "loss": 0.4978, "rewards/accuracies": 1.0, "rewards/chosen": 1.0023643970489502, "rewards/margins": 0.43847179412841797, "rewards/rejected": 0.5638926029205322, "step": 6974 }, { "epoch": 3.76, "learning_rate": 9.245997170082731e-10, "logits/chosen": -2.129448890686035, "logits/rejected": -2.291416883468628, "logps/chosen": -0.1645374745130539, "logps/rejected": -0.17318207025527954, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.9536532759666443, "rewards/margins": 0.004889786243438721, "rewards/rejected": 0.9487634897232056, "step": 6975 }, { "epoch": 3.76, "learning_rate": 9.204241733505869e-10, "logits/chosen": -2.1418111324310303, "logits/rejected": -2.3048274517059326, "logps/chosen": -6.595357894897461, "logps/rejected": -6.915500640869141, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 1.1175631284713745, "rewards/margins": 0.007429957389831543, "rewards/rejected": 1.110133171081543, "step": 6976 }, { "epoch": 3.76, "learning_rate": 9.162579919468838e-10, "logits/chosen": -2.0666286945343018, "logits/rejected": -2.067169427871704, "logps/chosen": -5.125608444213867, "logps/rejected": -5.509029865264893, "loss": 0.4251, "rewards/accuracies": 1.0, "rewards/chosen": 1.206347107887268, "rewards/margins": 0.6353763937950134, "rewards/rejected": 0.5709707140922546, "step": 6977 }, { "epoch": 3.76, "learning_rate": 9.121011735918894e-10, "logits/chosen": -2.238833427429199, "logits/rejected": -2.1112115383148193, "logps/chosen": -20.067276000976562, "logps/rejected": -11.197700500488281, "loss": 0.2201, "rewards/accuracies": 1.0, "rewards/chosen": 1.9338845014572144, "rewards/margins": 1.4013617038726807, "rewards/rejected": 0.5325227975845337, "step": 6978 }, { "epoch": 3.76, "learning_rate": 9.079537190785358e-10, "logits/chosen": -2.157839298248291, "logits/rejected": -2.322575092315674, "logps/chosen": -2.8681681156158447, "logps/rejected": -2.7956299781799316, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 1.016614317893982, "rewards/margins": 0.01621103286743164, "rewards/rejected": 1.0004032850265503, "step": 6979 }, { "epoch": 3.76, "learning_rate": 9.038156291979849e-10, "logits/chosen": -2.0886356830596924, "logits/rejected": -2.083324909210205, "logps/chosen": -3.2964963912963867, "logps/rejected": -10.579367637634277, "loss": 0.4826, "rewards/accuracies": 1.0, "rewards/chosen": 1.3482555150985718, "rewards/margins": 0.47761809825897217, "rewards/rejected": 0.8706374168395996, "step": 6980 }, { "epoch": 3.77, "learning_rate": 8.99686904739605e-10, "logits/chosen": -2.169647455215454, "logits/rejected": -2.277038097381592, "logps/chosen": -0.16240175068378448, "logps/rejected": -0.18076194822788239, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.8703107237815857, "rewards/margins": 0.014466702938079834, "rewards/rejected": 0.8558440208435059, "step": 6981 }, { "epoch": 3.77, "learning_rate": 8.955675464909773e-10, "logits/chosen": -2.138479709625244, "logits/rejected": -2.313988208770752, "logps/chosen": -0.4821295440196991, "logps/rejected": -0.5012339949607849, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.9751964807510376, "rewards/margins": 0.008039295673370361, "rewards/rejected": 0.9671571850776672, "step": 6982 }, { "epoch": 3.77, "learning_rate": 8.9145755523789e-10, "logits/chosen": -2.228970527648926, "logits/rejected": -2.316505193710327, "logps/chosen": -15.16873836517334, "logps/rejected": -7.598468780517578, "loss": 0.5503, "rewards/accuracies": 1.0, "rewards/chosen": 1.3493415117263794, "rewards/margins": 0.30961716175079346, "rewards/rejected": 1.039724349975586, "step": 6983 }, { "epoch": 3.77, "learning_rate": 8.873569317643603e-10, "logits/chosen": -2.168659210205078, "logits/rejected": -2.3001949787139893, "logps/chosen": -4.033271312713623, "logps/rejected": -26.88874626159668, "loss": 0.3462, "rewards/accuracies": 1.0, "rewards/chosen": 1.318155288696289, "rewards/margins": 0.8825814723968506, "rewards/rejected": 0.4355737864971161, "step": 6984 }, { "epoch": 3.77, "learning_rate": 8.83265676852607e-10, "logits/chosen": -2.15226149559021, "logits/rejected": -2.273791551589966, "logps/chosen": -0.8003080487251282, "logps/rejected": -0.812707781791687, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.9382616877555847, "rewards/margins": 0.01668691635131836, "rewards/rejected": 0.9215747714042664, "step": 6985 }, { "epoch": 3.77, "learning_rate": 8.791837912830613e-10, "logits/chosen": -2.0543057918548584, "logits/rejected": -2.2962114810943604, "logps/chosen": -1.4149348735809326, "logps/rejected": -1.2193851470947266, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 1.0875853300094604, "rewards/margins": 0.00352323055267334, "rewards/rejected": 1.084062099456787, "step": 6986 }, { "epoch": 3.77, "learning_rate": 8.751112758343782e-10, "logits/chosen": -2.0907084941864014, "logits/rejected": -2.079751491546631, "logps/chosen": -0.9883326292037964, "logps/rejected": -12.768952369689941, "loss": 0.2513, "rewards/accuracies": 1.0, "rewards/chosen": 1.1635165214538574, "rewards/margins": 1.2530171871185303, "rewards/rejected": -0.08950071781873703, "step": 6987 }, { "epoch": 3.77, "learning_rate": 8.710481312834028e-10, "logits/chosen": -1.9707040786743164, "logits/rejected": -2.3116037845611572, "logps/chosen": -0.6229001879692078, "logps/rejected": -10.605043411254883, "loss": 0.8376, "rewards/accuracies": 0.0, "rewards/chosen": 0.8901548385620117, "rewards/margins": -0.2706562280654907, "rewards/rejected": 1.1608110666275024, "step": 6988 }, { "epoch": 3.77, "learning_rate": 8.669943584052208e-10, "logits/chosen": -2.240450859069824, "logits/rejected": -2.233504295349121, "logps/chosen": -2.8972010612487793, "logps/rejected": -1.7641898393630981, "loss": 0.489, "rewards/accuracies": 1.0, "rewards/chosen": 1.1990196704864502, "rewards/margins": 0.46086037158966064, "rewards/rejected": 0.7381592988967896, "step": 6989 }, { "epoch": 3.77, "learning_rate": 8.629499579731025e-10, "logits/chosen": -2.161928653717041, "logits/rejected": -2.2905683517456055, "logps/chosen": -1.3739959001541138, "logps/rejected": -3.664240598678589, "loss": 0.7104, "rewards/accuracies": 0.0, "rewards/chosen": 0.9283012747764587, "rewards/margins": -0.03416162729263306, "rewards/rejected": 0.9624629020690918, "step": 6990 }, { "epoch": 3.77, "learning_rate": 8.589149307585586e-10, "logits/chosen": -2.10872483253479, "logits/rejected": -2.044658899307251, "logps/chosen": -17.38861656188965, "logps/rejected": -3.5452873706817627, "loss": 0.24, "rewards/accuracies": 1.0, "rewards/chosen": 1.9727877378463745, "rewards/margins": 1.3045268058776855, "rewards/rejected": 0.6682608723640442, "step": 6991 }, { "epoch": 3.77, "learning_rate": 8.548892775312844e-10, "logits/chosen": -2.0550904273986816, "logits/rejected": -2.269007921218872, "logps/chosen": -0.2031414806842804, "logps/rejected": -0.1998438835144043, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9123687744140625, "rewards/margins": 0.01647716760635376, "rewards/rejected": 0.8958916068077087, "step": 6992 }, { "epoch": 3.77, "learning_rate": 8.508729990592101e-10, "logits/chosen": -2.032663345336914, "logits/rejected": -2.0318713188171387, "logps/chosen": -0.1683628410100937, "logps/rejected": -6.072384834289551, "loss": 0.4124, "rewards/accuracies": 1.0, "rewards/chosen": 0.9831501245498657, "rewards/margins": 0.6724903583526611, "rewards/rejected": 0.310659795999527, "step": 6993 }, { "epoch": 3.77, "learning_rate": 8.468660961084673e-10, "logits/chosen": -2.1896884441375732, "logits/rejected": -2.180711507797241, "logps/chosen": -6.4528584480285645, "logps/rejected": -2.660355567932129, "loss": 0.458, "rewards/accuracies": 1.0, "rewards/chosen": 1.293391466140747, "rewards/margins": 0.5431626439094543, "rewards/rejected": 0.7502288222312927, "step": 6994 }, { "epoch": 3.77, "learning_rate": 8.42868569443389e-10, "logits/chosen": -2.2207469940185547, "logits/rejected": -2.2161803245544434, "logps/chosen": -2.6463205814361572, "logps/rejected": -7.393806457519531, "loss": 0.3809, "rewards/accuracies": 1.0, "rewards/chosen": 1.0422961711883545, "rewards/margins": 0.7686681747436523, "rewards/rejected": 0.27362796664237976, "step": 6995 }, { "epoch": 3.77, "learning_rate": 8.38880419826532e-10, "logits/chosen": -1.9922552108764648, "logits/rejected": -1.9920347929000854, "logps/chosen": -1.3974636793136597, "logps/rejected": -3.0904197692871094, "loss": 0.566, "rewards/accuracies": 1.0, "rewards/chosen": 1.1390317678451538, "rewards/margins": 0.2727453112602234, "rewards/rejected": 0.8662864565849304, "step": 6996 }, { "epoch": 3.77, "learning_rate": 8.349016480186654e-10, "logits/chosen": -2.0741384029388428, "logits/rejected": -2.0861382484436035, "logps/chosen": -2.196014881134033, "logps/rejected": -2.7959094047546387, "loss": 0.508, "rewards/accuracies": 1.0, "rewards/chosen": 1.0518678426742554, "rewards/margins": 0.41264963150024414, "rewards/rejected": 0.6392182111740112, "step": 6997 }, { "epoch": 3.77, "learning_rate": 8.309322547787711e-10, "logits/chosen": -2.0336763858795166, "logits/rejected": -2.029040813446045, "logps/chosen": -2.384906768798828, "logps/rejected": -3.740147590637207, "loss": 0.6046, "rewards/accuracies": 1.0, "rewards/chosen": 1.1444858312606812, "rewards/margins": 0.18564516305923462, "rewards/rejected": 0.9588406682014465, "step": 6998 }, { "epoch": 3.78, "learning_rate": 8.269722408640323e-10, "logits/chosen": -2.1163954734802246, "logits/rejected": -2.3214173316955566, "logps/chosen": -0.09593939781188965, "logps/rejected": -0.10275822132825851, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 1.044464349746704, "rewards/margins": 0.02437591552734375, "rewards/rejected": 1.0200884342193604, "step": 6999 }, { "epoch": 3.78, "learning_rate": 8.230216070298502e-10, "logits/chosen": -2.188823699951172, "logits/rejected": -2.2163455486297607, "logps/chosen": -14.024247169494629, "logps/rejected": -3.0329625606536865, "loss": 0.3471, "rewards/accuracies": 1.0, "rewards/chosen": 1.9782425165176392, "rewards/margins": 0.8796335458755493, "rewards/rejected": 1.0986089706420898, "step": 7000 }, { "epoch": 3.78, "learning_rate": 8.19080354029833e-10, "logits/chosen": -2.0845813751220703, "logits/rejected": -2.0865652561187744, "logps/chosen": -4.256758689880371, "logps/rejected": -1.9150474071502686, "loss": 0.2847, "rewards/accuracies": 1.0, "rewards/chosen": 1.7436245679855347, "rewards/margins": 1.1104214191436768, "rewards/rejected": 0.6332032084465027, "step": 7001 }, { "epoch": 3.78, "learning_rate": 8.151484826157961e-10, "logits/chosen": -2.119497776031494, "logits/rejected": -2.346536636352539, "logps/chosen": -0.3798474073410034, "logps/rejected": -0.39016813039779663, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.9486730694770813, "rewards/margins": 0.0029069185256958008, "rewards/rejected": 0.9457661509513855, "step": 7002 }, { "epoch": 3.78, "learning_rate": 8.112259935377785e-10, "logits/chosen": -2.1565773487091064, "logits/rejected": -2.3257861137390137, "logps/chosen": -7.963825225830078, "logps/rejected": -8.489665985107422, "loss": 0.4352, "rewards/accuracies": 1.0, "rewards/chosen": 1.2077337503433228, "rewards/margins": 0.6064303517341614, "rewards/rejected": 0.6013033986091614, "step": 7003 }, { "epoch": 3.78, "learning_rate": 8.07312887544015e-10, "logits/chosen": -2.058516263961792, "logits/rejected": -2.2640485763549805, "logps/chosen": -0.5414240956306458, "logps/rejected": -0.5633671879768372, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 0.8355309367179871, "rewards/margins": 0.025268137454986572, "rewards/rejected": 0.8102627992630005, "step": 7004 }, { "epoch": 3.78, "learning_rate": 8.034091653809583e-10, "logits/chosen": -1.9885470867156982, "logits/rejected": -1.9953157901763916, "logps/chosen": -2.46370267868042, "logps/rejected": -5.116384029388428, "loss": 0.3342, "rewards/accuracies": 1.0, "rewards/chosen": 1.392000436782837, "rewards/margins": 0.924328088760376, "rewards/rejected": 0.46767231822013855, "step": 7005 }, { "epoch": 3.78, "learning_rate": 7.995148277932684e-10, "logits/chosen": -2.104429244995117, "logits/rejected": -2.1130075454711914, "logps/chosen": -2.3049979209899902, "logps/rejected": -7.13352108001709, "loss": 0.3299, "rewards/accuracies": 1.0, "rewards/chosen": 1.1663182973861694, "rewards/margins": 0.939620852470398, "rewards/rejected": 0.22669744491577148, "step": 7006 }, { "epoch": 3.78, "learning_rate": 7.956298755238177e-10, "logits/chosen": -2.1167659759521484, "logits/rejected": -2.125859260559082, "logps/chosen": -15.35582447052002, "logps/rejected": -8.412900924682617, "loss": 0.1458, "rewards/accuracies": 1.0, "rewards/chosen": 2.3670098781585693, "rewards/margins": 1.8517475128173828, "rewards/rejected": 0.5152624249458313, "step": 7007 }, { "epoch": 3.78, "learning_rate": 7.917543093136858e-10, "logits/chosen": -2.1702029705047607, "logits/rejected": -2.173393964767456, "logps/chosen": -1.339745283126831, "logps/rejected": -5.965199947357178, "loss": 0.4073, "rewards/accuracies": 1.0, "rewards/chosen": 1.0886540412902832, "rewards/margins": 0.6876010894775391, "rewards/rejected": 0.40105292201042175, "step": 7008 }, { "epoch": 3.78, "learning_rate": 7.878881299021645e-10, "logits/chosen": -2.100808620452881, "logits/rejected": -2.0930402278900146, "logps/chosen": -7.02599573135376, "logps/rejected": -6.592644691467285, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 1.6701561212539673, "rewards/margins": 1.2407863140106201, "rewards/rejected": 0.42936983704566956, "step": 7009 }, { "epoch": 3.78, "learning_rate": 7.840313380267471e-10, "logits/chosen": -2.191206932067871, "logits/rejected": -2.2623820304870605, "logps/chosen": -5.881730079650879, "logps/rejected": -1.725727915763855, "loss": 0.7537, "rewards/accuracies": 0.0, "rewards/chosen": 0.7496616244316101, "rewards/margins": -0.11756938695907593, "rewards/rejected": 0.867231011390686, "step": 7010 }, { "epoch": 3.78, "learning_rate": 7.801839344231454e-10, "logits/chosen": -1.9612149000167847, "logits/rejected": -2.2654833793640137, "logps/chosen": -1.0019218921661377, "logps/rejected": -0.95968097448349, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8380069136619568, "rewards/margins": 0.012197792530059814, "rewards/rejected": 0.825809121131897, "step": 7011 }, { "epoch": 3.78, "learning_rate": 7.76345919825283e-10, "logits/chosen": -2.1124801635742188, "logits/rejected": -2.122901201248169, "logps/chosen": -1.8253072500228882, "logps/rejected": -1.9122428894042969, "loss": 0.4967, "rewards/accuracies": 1.0, "rewards/chosen": 1.21066415309906, "rewards/margins": 0.4410431385040283, "rewards/rejected": 0.7696210145950317, "step": 7012 }, { "epoch": 3.78, "learning_rate": 7.725172949652747e-10, "logits/chosen": -2.1769936084747314, "logits/rejected": -2.1816537380218506, "logps/chosen": -1.8923797607421875, "logps/rejected": -5.101036071777344, "loss": 0.326, "rewards/accuracies": 1.0, "rewards/chosen": 1.3215607404708862, "rewards/margins": 0.953609049320221, "rewards/rejected": 0.3679516911506653, "step": 7013 }, { "epoch": 3.78, "learning_rate": 7.686980605734694e-10, "logits/chosen": -2.0683021545410156, "logits/rejected": -2.3116655349731445, "logps/chosen": -0.17221777141094208, "logps/rejected": -0.17015209794044495, "loss": 0.673, "rewards/accuracies": 1.0, "rewards/chosen": 0.9880228042602539, "rewards/margins": 0.040808022022247314, "rewards/rejected": 0.9472147822380066, "step": 7014 }, { "epoch": 3.78, "learning_rate": 7.64888217378401e-10, "logits/chosen": -2.189124822616577, "logits/rejected": -2.1385748386383057, "logps/chosen": -15.369172096252441, "logps/rejected": -3.5083706378936768, "loss": 0.2615, "rewards/accuracies": 1.0, "rewards/chosen": 1.7924377918243408, "rewards/margins": 1.2078473567962646, "rewards/rejected": 0.5845904350280762, "step": 7015 }, { "epoch": 3.78, "learning_rate": 7.610877661068271e-10, "logits/chosen": -2.1713905334472656, "logits/rejected": -2.1710944175720215, "logps/chosen": -2.534700632095337, "logps/rejected": -8.385563850402832, "loss": 0.2951, "rewards/accuracies": 1.0, "rewards/chosen": 1.0883845090866089, "rewards/margins": 1.069349765777588, "rewards/rejected": 0.019034767523407936, "step": 7016 }, { "epoch": 3.78, "learning_rate": 7.572967074837066e-10, "logits/chosen": -2.0744118690490723, "logits/rejected": -2.2984344959259033, "logps/chosen": -0.22487597167491913, "logps/rejected": -0.2537524998188019, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 1.035481572151184, "rewards/margins": 0.03183245658874512, "rewards/rejected": 1.003649115562439, "step": 7017 }, { "epoch": 3.79, "learning_rate": 7.535150422322056e-10, "logits/chosen": -2.046609401702881, "logits/rejected": -2.0475239753723145, "logps/chosen": -0.515148937702179, "logps/rejected": -5.947091102600098, "loss": 0.4668, "rewards/accuracies": 1.0, "rewards/chosen": 0.8995270133018494, "rewards/margins": 0.5194344520568848, "rewards/rejected": 0.3800925314426422, "step": 7018 }, { "epoch": 3.79, "learning_rate": 7.497427710737136e-10, "logits/chosen": -1.9693599939346313, "logits/rejected": -1.9694281816482544, "logps/chosen": -0.22368797659873962, "logps/rejected": -7.437396049499512, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 0.9759065508842468, "rewards/margins": 0.6406328678131104, "rewards/rejected": 0.3352736532688141, "step": 7019 }, { "epoch": 3.79, "learning_rate": 7.45979894727805e-10, "logits/chosen": -2.1851696968078613, "logits/rejected": -2.1798653602600098, "logps/chosen": -7.233797550201416, "logps/rejected": -6.990306854248047, "loss": 0.3205, "rewards/accuracies": 1.0, "rewards/chosen": 1.2448762655258179, "rewards/margins": 0.9733340740203857, "rewards/rejected": 0.27154216170310974, "step": 7020 }, { "epoch": 3.79, "learning_rate": 7.422264139122835e-10, "logits/chosen": -2.004777669906616, "logits/rejected": -1.9997788667678833, "logps/chosen": -2.2894585132598877, "logps/rejected": -1.8395479917526245, "loss": 0.5705, "rewards/accuracies": 1.0, "rewards/chosen": 1.2989636659622192, "rewards/margins": 0.26243042945861816, "rewards/rejected": 1.036533236503601, "step": 7021 }, { "epoch": 3.79, "learning_rate": 7.384823293431375e-10, "logits/chosen": -2.2134854793548584, "logits/rejected": -2.205704927444458, "logps/chosen": -5.998824119567871, "logps/rejected": -8.032708168029785, "loss": 0.2478, "rewards/accuracies": 1.0, "rewards/chosen": 1.3935261964797974, "rewards/margins": 1.2686294317245483, "rewards/rejected": 0.1248968169093132, "step": 7022 }, { "epoch": 3.79, "learning_rate": 7.347476417345844e-10, "logits/chosen": -1.978749394416809, "logits/rejected": -2.2253072261810303, "logps/chosen": -0.5382050275802612, "logps/rejected": -0.5890403389930725, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.8538430333137512, "rewards/margins": 0.02162754535675049, "rewards/rejected": 0.8322154879570007, "step": 7023 }, { "epoch": 3.79, "learning_rate": 7.310223517990488e-10, "logits/chosen": -2.0577831268310547, "logits/rejected": -2.2496566772460938, "logps/chosen": -0.23061832785606384, "logps/rejected": -0.19546103477478027, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.9685909152030945, "rewards/margins": 0.021258056163787842, "rewards/rejected": 0.9473328590393066, "step": 7024 }, { "epoch": 3.79, "learning_rate": 7.273064602471346e-10, "logits/chosen": -2.0873308181762695, "logits/rejected": -2.093193531036377, "logps/chosen": -0.9571812748908997, "logps/rejected": -1.8383255004882812, "loss": 0.4829, "rewards/accuracies": 1.0, "rewards/chosen": 1.1992028951644897, "rewards/margins": 0.4767189621925354, "rewards/rejected": 0.7224839329719543, "step": 7025 }, { "epoch": 3.79, "learning_rate": 7.235999677876858e-10, "logits/chosen": -2.0770349502563477, "logits/rejected": -2.2687270641326904, "logps/chosen": -0.21846267580986023, "logps/rejected": -0.22843389213085175, "loss": 0.7027, "rewards/accuracies": 0.0, "rewards/chosen": 0.8435363173484802, "rewards/margins": -0.01903444528579712, "rewards/rejected": 0.8625707626342773, "step": 7026 }, { "epoch": 3.79, "learning_rate": 7.199028751277369e-10, "logits/chosen": -2.0228259563446045, "logits/rejected": -2.3553085327148438, "logps/chosen": -2.2850375175476074, "logps/rejected": -3.1957626342773438, "loss": 0.6528, "rewards/accuracies": 1.0, "rewards/chosen": 1.0326544046401978, "rewards/margins": 0.08234184980392456, "rewards/rejected": 0.9503125548362732, "step": 7027 }, { "epoch": 3.79, "learning_rate": 7.162151829725293e-10, "logits/chosen": -2.0600712299346924, "logits/rejected": -2.2635464668273926, "logps/chosen": -0.4557689130306244, "logps/rejected": -0.4492456912994385, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.9785749316215515, "rewards/margins": 0.016750872135162354, "rewards/rejected": 0.9618240594863892, "step": 7028 }, { "epoch": 3.79, "learning_rate": 7.125368920255171e-10, "logits/chosen": -2.2018442153930664, "logits/rejected": -2.209650754928589, "logps/chosen": -1.197449803352356, "logps/rejected": -2.093019723892212, "loss": 0.4604, "rewards/accuracies": 1.0, "rewards/chosen": 1.0869101285934448, "rewards/margins": 0.536754310131073, "rewards/rejected": 0.5501558184623718, "step": 7029 }, { "epoch": 3.79, "learning_rate": 7.088680029883665e-10, "logits/chosen": -2.166752338409424, "logits/rejected": -2.1274983882904053, "logps/chosen": -19.559045791625977, "logps/rejected": -3.360419511795044, "loss": 0.1932, "rewards/accuracies": 1.0, "rewards/chosen": 2.1369216442108154, "rewards/margins": 1.54610013961792, "rewards/rejected": 0.5908215641975403, "step": 7030 }, { "epoch": 3.79, "learning_rate": 7.052085165609345e-10, "logits/chosen": -2.0724334716796875, "logits/rejected": -2.3075125217437744, "logps/chosen": -6.885558128356934, "logps/rejected": -4.608102798461914, "loss": 0.7316, "rewards/accuracies": 0.0, "rewards/chosen": 1.0313161611557007, "rewards/margins": -0.07545459270477295, "rewards/rejected": 1.1067707538604736, "step": 7031 }, { "epoch": 3.79, "learning_rate": 7.015584334412961e-10, "logits/chosen": -2.0688395500183105, "logits/rejected": -2.076117515563965, "logps/chosen": -1.8992500305175781, "logps/rejected": -5.292647361755371, "loss": 0.4515, "rewards/accuracies": 1.0, "rewards/chosen": 1.0242236852645874, "rewards/margins": 0.5608587265014648, "rewards/rejected": 0.46336498856544495, "step": 7032 }, { "epoch": 3.79, "learning_rate": 6.97917754325722e-10, "logits/chosen": -1.9951122999191284, "logits/rejected": -2.0643749237060547, "logps/chosen": -3.678161382675171, "logps/rejected": -23.552541732788086, "loss": 0.245, "rewards/accuracies": 1.0, "rewards/chosen": 1.6126006841659546, "rewards/margins": 1.2817128896713257, "rewards/rejected": 0.3308877944946289, "step": 7033 }, { "epoch": 3.79, "learning_rate": 6.942864799087011e-10, "logits/chosen": -2.0231614112854004, "logits/rejected": -2.034071445465088, "logps/chosen": -2.482736825942993, "logps/rejected": -1.762903094291687, "loss": 0.4106, "rewards/accuracies": 1.0, "rewards/chosen": 1.3517043590545654, "rewards/margins": 0.6777977347373962, "rewards/rejected": 0.6739066243171692, "step": 7034 }, { "epoch": 3.79, "learning_rate": 6.906646108829239e-10, "logits/chosen": -2.0653960704803467, "logits/rejected": -2.0747218132019043, "logps/chosen": -4.697119235992432, "logps/rejected": -3.398655414581299, "loss": 0.4394, "rewards/accuracies": 1.0, "rewards/chosen": 1.1356416940689087, "rewards/margins": 0.5945595502853394, "rewards/rejected": 0.5410821437835693, "step": 7035 }, { "epoch": 3.8, "learning_rate": 6.870521479392876e-10, "logits/chosen": -2.107041835784912, "logits/rejected": -2.3081467151641846, "logps/chosen": -0.07329831272363663, "logps/rejected": -0.0796007364988327, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.9390174150466919, "rewards/margins": 0.011092841625213623, "rewards/rejected": 0.9279245734214783, "step": 7036 }, { "epoch": 3.8, "learning_rate": 6.834490917668856e-10, "logits/chosen": -2.1383540630340576, "logits/rejected": -2.1345763206481934, "logps/chosen": -3.93078875541687, "logps/rejected": -1.4725046157836914, "loss": 0.3789, "rewards/accuracies": 1.0, "rewards/chosen": 1.6194227933883667, "rewards/margins": 0.7749168276786804, "rewards/rejected": 0.8445059657096863, "step": 7037 }, { "epoch": 3.8, "learning_rate": 6.79855443053029e-10, "logits/chosen": -2.0792810916900635, "logits/rejected": -2.07859468460083, "logps/chosen": -0.34525662660598755, "logps/rejected": -2.3334102630615234, "loss": 0.5804, "rewards/accuracies": 1.0, "rewards/chosen": 1.0080053806304932, "rewards/margins": 0.23987847566604614, "rewards/rejected": 0.768126904964447, "step": 7038 }, { "epoch": 3.8, "learning_rate": 6.762712024832307e-10, "logits/chosen": -2.031717538833618, "logits/rejected": -2.0309760570526123, "logps/chosen": -0.47079959511756897, "logps/rejected": -3.239859104156494, "loss": 0.5008, "rewards/accuracies": 1.0, "rewards/chosen": 1.0463546514511108, "rewards/margins": 0.43070167303085327, "rewards/rejected": 0.6156529784202576, "step": 7039 }, { "epoch": 3.8, "learning_rate": 6.726963707412103e-10, "logits/chosen": -2.1432700157165527, "logits/rejected": -2.24373722076416, "logps/chosen": -4.728043079376221, "logps/rejected": -0.6570221781730652, "loss": 0.534, "rewards/accuracies": 1.0, "rewards/chosen": 1.1950262784957886, "rewards/margins": 0.3485053777694702, "rewards/rejected": 0.8465209007263184, "step": 7040 }, { "epoch": 3.8, "learning_rate": 6.691309485088892e-10, "logits/chosen": -2.1248581409454346, "logits/rejected": -2.1195428371429443, "logps/chosen": -4.561756134033203, "logps/rejected": -5.980248928070068, "loss": 0.3792, "rewards/accuracies": 1.0, "rewards/chosen": 1.1569756269454956, "rewards/margins": 0.7742303609848022, "rewards/rejected": 0.38274523615837097, "step": 7041 }, { "epoch": 3.8, "learning_rate": 6.655749364663899e-10, "logits/chosen": -2.0393433570861816, "logits/rejected": -2.036689281463623, "logps/chosen": -0.21117202937602997, "logps/rejected": -5.675633907318115, "loss": 0.4185, "rewards/accuracies": 1.0, "rewards/chosen": 0.9480161666870117, "rewards/margins": 0.6544484496116638, "rewards/rejected": 0.2935677170753479, "step": 7042 }, { "epoch": 3.8, "learning_rate": 6.62028335292053e-10, "logits/chosen": -2.050267457962036, "logits/rejected": -2.335883855819702, "logps/chosen": -0.9019782543182373, "logps/rejected": -0.9977192878723145, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 1.2651797533035278, "rewards/margins": 0.016363859176635742, "rewards/rejected": 1.248815894126892, "step": 7043 }, { "epoch": 3.8, "learning_rate": 6.584911456624153e-10, "logits/chosen": -2.129275321960449, "logits/rejected": -2.3990273475646973, "logps/chosen": -0.07820277661085129, "logps/rejected": -0.07544171810150146, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.7824110388755798, "rewards/margins": 0.019515395164489746, "rewards/rejected": 0.7628956437110901, "step": 7044 }, { "epoch": 3.8, "learning_rate": 6.549633682522148e-10, "logits/chosen": -2.0378713607788086, "logits/rejected": -2.2296669483184814, "logps/chosen": -1.56899893283844, "logps/rejected": -4.519116401672363, "loss": 0.605, "rewards/accuracies": 1.0, "rewards/chosen": 0.9423600435256958, "rewards/margins": 0.184903085231781, "rewards/rejected": 0.7574569582939148, "step": 7045 }, { "epoch": 3.8, "learning_rate": 6.514450037343966e-10, "logits/chosen": -1.903066635131836, "logits/rejected": -2.255309581756592, "logps/chosen": -0.7573611736297607, "logps/rejected": -0.8291712999343872, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 1.078501582145691, "rewards/margins": -0.005722999572753906, "rewards/rejected": 1.0842245817184448, "step": 7046 }, { "epoch": 3.8, "learning_rate": 6.479360527801181e-10, "logits/chosen": -2.1109628677368164, "logits/rejected": -2.332012176513672, "logps/chosen": -7.985518455505371, "logps/rejected": -9.250581741333008, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 1.081903338432312, "rewards/margins": 0.019747018814086914, "rewards/rejected": 1.062156319618225, "step": 7047 }, { "epoch": 3.8, "learning_rate": 6.444365160587329e-10, "logits/chosen": -2.1005983352661133, "logits/rejected": -2.110851526260376, "logps/chosen": -0.7557240128517151, "logps/rejected": -12.503942489624023, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 0.9665058255195618, "rewards/margins": 0.5518283843994141, "rewards/rejected": 0.4146774411201477, "step": 7048 }, { "epoch": 3.8, "learning_rate": 6.409463942378013e-10, "logits/chosen": -2.1109869480133057, "logits/rejected": -2.2975351810455322, "logps/chosen": -0.8477159142494202, "logps/rejected": -0.7477092742919922, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.9817437529563904, "rewards/margins": -0.002802431583404541, "rewards/rejected": 0.9845461845397949, "step": 7049 }, { "epoch": 3.8, "learning_rate": 6.374656879830853e-10, "logits/chosen": -2.1742167472839355, "logits/rejected": -2.3298144340515137, "logps/chosen": -3.1431596279144287, "logps/rejected": -3.015254020690918, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.7893315553665161, "rewards/margins": -0.0006183385848999023, "rewards/rejected": 0.789949893951416, "step": 7050 }, { "epoch": 3.8, "learning_rate": 6.339943979585538e-10, "logits/chosen": -2.039057493209839, "logits/rejected": -2.0363361835479736, "logps/chosen": -6.498327732086182, "logps/rejected": -5.657126426696777, "loss": 0.2982, "rewards/accuracies": 1.0, "rewards/chosen": 1.4425318241119385, "rewards/margins": 1.0572336912155151, "rewards/rejected": 0.3852981626987457, "step": 7051 }, { "epoch": 3.8, "learning_rate": 6.305325248263771e-10, "logits/chosen": -2.095841646194458, "logits/rejected": -2.1076836585998535, "logps/chosen": -1.2714588642120361, "logps/rejected": -2.5619616508483887, "loss": 0.4838, "rewards/accuracies": 1.0, "rewards/chosen": 0.984640896320343, "rewards/margins": 0.4744220972061157, "rewards/rejected": 0.5102187991142273, "step": 7052 }, { "epoch": 3.8, "learning_rate": 6.270800692469325e-10, "logits/chosen": -2.3563594818115234, "logits/rejected": -2.207951784133911, "logps/chosen": -19.21904182434082, "logps/rejected": -1.719850778579712, "loss": 0.1469, "rewards/accuracies": 1.0, "rewards/chosen": 2.49078631401062, "rewards/margins": 1.8437926769256592, "rewards/rejected": 0.6469935774803162, "step": 7053 }, { "epoch": 3.8, "learning_rate": 6.236370318787987e-10, "logits/chosen": -1.9524579048156738, "logits/rejected": -1.9518460035324097, "logps/chosen": -0.7759802341461182, "logps/rejected": -1.1489428281784058, "loss": 0.6529, "rewards/accuracies": 1.0, "rewards/chosen": 0.862713634967804, "rewards/margins": 0.08215445280075073, "rewards/rejected": 0.7805591821670532, "step": 7054 }, { "epoch": 3.81, "learning_rate": 6.202034133787559e-10, "logits/chosen": -2.0560319423675537, "logits/rejected": -2.0685999393463135, "logps/chosen": -3.2685508728027344, "logps/rejected": -2.622352123260498, "loss": 0.4697, "rewards/accuracies": 1.0, "rewards/chosen": 1.0772863626480103, "rewards/margins": 0.5116896033287048, "rewards/rejected": 0.5655967593193054, "step": 7055 }, { "epoch": 3.81, "learning_rate": 6.167792144017914e-10, "logits/chosen": -1.991755723953247, "logits/rejected": -2.009169340133667, "logps/chosen": -0.6187158226966858, "logps/rejected": -14.007057189941406, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9447178840637207, "rewards/margins": 0.05129289627075195, "rewards/rejected": 0.8934249877929688, "step": 7056 }, { "epoch": 3.81, "learning_rate": 6.133644356010936e-10, "logits/chosen": -2.076535701751709, "logits/rejected": -2.2905101776123047, "logps/chosen": -7.774496078491211, "logps/rejected": -1.9393150806427002, "loss": 0.7428, "rewards/accuracies": 0.0, "rewards/chosen": 0.971688449382782, "rewards/margins": -0.09703081846237183, "rewards/rejected": 1.0687192678451538, "step": 7057 }, { "epoch": 3.81, "learning_rate": 6.099590776280527e-10, "logits/chosen": -2.1949493885040283, "logits/rejected": -2.183734893798828, "logps/chosen": -2.5250954627990723, "logps/rejected": -4.400747299194336, "loss": 0.377, "rewards/accuracies": 1.0, "rewards/chosen": 1.2138519287109375, "rewards/margins": 0.7811476588249207, "rewards/rejected": 0.43270426988601685, "step": 7058 }, { "epoch": 3.81, "learning_rate": 6.065631411322603e-10, "logits/chosen": -2.0520970821380615, "logits/rejected": -2.055694103240967, "logps/chosen": -3.650864362716675, "logps/rejected": -4.056766033172607, "loss": 0.2867, "rewards/accuracies": 1.0, "rewards/chosen": 1.6668342351913452, "rewards/margins": 1.1027097702026367, "rewards/rejected": 0.5641244649887085, "step": 7059 }, { "epoch": 3.81, "learning_rate": 6.031766267615257e-10, "logits/chosen": -2.1069464683532715, "logits/rejected": -2.1180694103240967, "logps/chosen": -1.0961732864379883, "logps/rejected": -5.738288402557373, "loss": 0.5618, "rewards/accuracies": 1.0, "rewards/chosen": 1.1694129705429077, "rewards/margins": 0.2825586199760437, "rewards/rejected": 0.886854350566864, "step": 7060 }, { "epoch": 3.81, "learning_rate": 5.997995351618323e-10, "logits/chosen": -2.269953966140747, "logits/rejected": -2.1841678619384766, "logps/chosen": -22.271831512451172, "logps/rejected": -4.751824855804443, "loss": 0.1674, "rewards/accuracies": 1.0, "rewards/chosen": 2.182894229888916, "rewards/margins": 1.7027406692504883, "rewards/rejected": 0.48015353083610535, "step": 7061 }, { "epoch": 3.81, "learning_rate": 5.964318669773982e-10, "logits/chosen": -2.1215968132019043, "logits/rejected": -2.1522302627563477, "logps/chosen": -4.877481460571289, "logps/rejected": -8.763423919677734, "loss": 0.3793, "rewards/accuracies": 1.0, "rewards/chosen": 1.7099065780639648, "rewards/margins": 0.7736652493476868, "rewards/rejected": 0.9362413287162781, "step": 7062 }, { "epoch": 3.81, "learning_rate": 5.93073622850615e-10, "logits/chosen": -2.002204418182373, "logits/rejected": -2.0270652770996094, "logps/chosen": -6.541129112243652, "logps/rejected": -23.036266326904297, "loss": 0.7144, "rewards/accuracies": 0.0, "rewards/chosen": 1.2634717226028442, "rewards/margins": -0.041988253593444824, "rewards/rejected": 1.305459976196289, "step": 7063 }, { "epoch": 3.81, "learning_rate": 5.897248034220981e-10, "logits/chosen": -2.0067591667175293, "logits/rejected": -2.00638484954834, "logps/chosen": -0.31836676597595215, "logps/rejected": -1.9063063859939575, "loss": 0.5614, "rewards/accuracies": 1.0, "rewards/chosen": 0.8962465524673462, "rewards/margins": 0.2834628224372864, "rewards/rejected": 0.6127837300300598, "step": 7064 }, { "epoch": 3.81, "learning_rate": 5.863854093306586e-10, "logits/chosen": -2.070204257965088, "logits/rejected": -2.069523572921753, "logps/chosen": -2.8406198024749756, "logps/rejected": -6.457264423370361, "loss": 0.2311, "rewards/accuracies": 1.0, "rewards/chosen": 1.5904772281646729, "rewards/margins": 1.3470343351364136, "rewards/rejected": 0.24344287812709808, "step": 7065 }, { "epoch": 3.81, "learning_rate": 5.830554412133038e-10, "logits/chosen": -2.1169955730438232, "logits/rejected": -2.112494707107544, "logps/chosen": -0.7195650339126587, "logps/rejected": -9.356754302978516, "loss": 0.2878, "rewards/accuracies": 1.0, "rewards/chosen": 1.291269063949585, "rewards/margins": 1.0982203483581543, "rewards/rejected": 0.19304867088794708, "step": 7066 }, { "epoch": 3.81, "learning_rate": 5.797348997052476e-10, "logits/chosen": -2.048068046569824, "logits/rejected": -2.2885639667510986, "logps/chosen": -0.5673633813858032, "logps/rejected": -0.508849561214447, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9063296318054199, "rewards/margins": 0.021521270275115967, "rewards/rejected": 0.884808361530304, "step": 7067 }, { "epoch": 3.81, "learning_rate": 5.764237854399057e-10, "logits/chosen": -1.9686157703399658, "logits/rejected": -2.288100242614746, "logps/chosen": -0.16924050450325012, "logps/rejected": -0.17798033356666565, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 1.0578457117080688, "rewards/margins": 0.0001900196075439453, "rewards/rejected": 1.057655692100525, "step": 7068 }, { "epoch": 3.81, "learning_rate": 5.731220990488949e-10, "logits/chosen": -2.0473036766052246, "logits/rejected": -2.056832790374756, "logps/chosen": -1.083189606666565, "logps/rejected": -4.108565807342529, "loss": 0.4518, "rewards/accuracies": 1.0, "rewards/chosen": 1.0698615312576294, "rewards/margins": 0.5600565671920776, "rewards/rejected": 0.5098049640655518, "step": 7069 }, { "epoch": 3.81, "learning_rate": 5.698298411620395e-10, "logits/chosen": -2.1227245330810547, "logits/rejected": -2.1954517364501953, "logps/chosen": -0.24602864682674408, "logps/rejected": -29.943084716796875, "loss": 0.2109, "rewards/accuracies": 1.0, "rewards/chosen": 0.9755876660346985, "rewards/margins": 1.449257254600525, "rewards/rejected": -0.4736696183681488, "step": 7070 }, { "epoch": 3.81, "learning_rate": 5.66547012407348e-10, "logits/chosen": -2.1285383701324463, "logits/rejected": -2.312915563583374, "logps/chosen": -0.7611238956451416, "logps/rejected": -16.70138931274414, "loss": 0.5493, "rewards/accuracies": 1.0, "rewards/chosen": 1.2425979375839233, "rewards/margins": 0.3119161128997803, "rewards/rejected": 0.9306818246841431, "step": 7071 }, { "epoch": 3.81, "learning_rate": 5.632736134110528e-10, "logits/chosen": -2.02701735496521, "logits/rejected": -2.0280396938323975, "logps/chosen": -1.0556328296661377, "logps/rejected": -3.068495988845825, "loss": 0.5035, "rewards/accuracies": 1.0, "rewards/chosen": 1.1223909854888916, "rewards/margins": 0.42399048805236816, "rewards/rejected": 0.6984004974365234, "step": 7072 }, { "epoch": 3.81, "learning_rate": 5.600096447975655e-10, "logits/chosen": -2.0816338062286377, "logits/rejected": -2.313636541366577, "logps/chosen": -0.676417350769043, "logps/rejected": -0.8158712387084961, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.791775643825531, "rewards/margins": 0.03836202621459961, "rewards/rejected": 0.7534136176109314, "step": 7073 }, { "epoch": 3.82, "learning_rate": 5.56755107189516e-10, "logits/chosen": -2.152121067047119, "logits/rejected": -2.1642987728118896, "logps/chosen": -4.189002990722656, "logps/rejected": -3.8005878925323486, "loss": 0.3863, "rewards/accuracies": 1.0, "rewards/chosen": 1.3356876373291016, "rewards/margins": 0.7517198324203491, "rewards/rejected": 0.5839678049087524, "step": 7074 }, { "epoch": 3.82, "learning_rate": 5.535100012077298e-10, "logits/chosen": -2.0456995964050293, "logits/rejected": -2.2614762783050537, "logps/chosen": -0.3548595905303955, "logps/rejected": -0.4123280346393585, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 1.0269314050674438, "rewards/margins": 0.00798177719116211, "rewards/rejected": 1.0189496278762817, "step": 7075 }, { "epoch": 3.82, "learning_rate": 5.502743274712285e-10, "logits/chosen": -2.00070858001709, "logits/rejected": -1.9991449117660522, "logps/chosen": -1.0827124118804932, "logps/rejected": -6.360836982727051, "loss": 0.3782, "rewards/accuracies": 1.0, "rewards/chosen": 1.0229285955429077, "rewards/margins": 0.7771946787834167, "rewards/rejected": 0.24573393166065216, "step": 7076 }, { "epoch": 3.82, "learning_rate": 5.47048086597246e-10, "logits/chosen": -2.1125826835632324, "logits/rejected": -2.062817096710205, "logps/chosen": -7.852555274963379, "logps/rejected": -6.204162120819092, "loss": 0.3501, "rewards/accuracies": 1.0, "rewards/chosen": 1.650613784790039, "rewards/margins": 0.8693702220916748, "rewards/rejected": 0.7812435626983643, "step": 7077 }, { "epoch": 3.82, "learning_rate": 5.438312792012012e-10, "logits/chosen": -2.0817275047302246, "logits/rejected": -2.339043378829956, "logps/chosen": -15.622238159179688, "logps/rejected": -12.29755973815918, "loss": 0.8055, "rewards/accuracies": 0.0, "rewards/chosen": 0.3423128128051758, "rewards/margins": -0.21337741613388062, "rewards/rejected": 0.5556902289390564, "step": 7078 }, { "epoch": 3.82, "learning_rate": 5.406239058967199e-10, "logits/chosen": -2.0143697261810303, "logits/rejected": -2.02298903465271, "logps/chosen": -1.3999565839767456, "logps/rejected": -3.9701478481292725, "loss": 0.4729, "rewards/accuracies": 1.0, "rewards/chosen": 0.9992552995681763, "rewards/margins": 0.5031611919403076, "rewards/rejected": 0.49609413743019104, "step": 7079 }, { "epoch": 3.82, "learning_rate": 5.374259672956405e-10, "logits/chosen": -2.204035758972168, "logits/rejected": -2.0870862007141113, "logps/chosen": -33.78534698486328, "logps/rejected": -2.1750898361206055, "loss": 0.1206, "rewards/accuracies": 1.0, "rewards/chosen": 2.8048653602600098, "rewards/margins": 2.054396152496338, "rewards/rejected": 0.7504693269729614, "step": 7080 }, { "epoch": 3.82, "learning_rate": 5.342374640079806e-10, "logits/chosen": -2.0279295444488525, "logits/rejected": -2.0076839923858643, "logps/chosen": -27.85989761352539, "logps/rejected": -9.89774227142334, "loss": 0.1714, "rewards/accuracies": 1.0, "rewards/chosen": 1.9041961431503296, "rewards/margins": 1.6770613193511963, "rewards/rejected": 0.2271348088979721, "step": 7081 }, { "epoch": 3.82, "learning_rate": 5.310583966419758e-10, "logits/chosen": -2.084254503250122, "logits/rejected": -2.2974045276641846, "logps/chosen": -2.4874141216278076, "logps/rejected": -2.708672046661377, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.8667131662368774, "rewards/margins": 0.025140762329101562, "rewards/rejected": 0.8415724039077759, "step": 7082 }, { "epoch": 3.82, "learning_rate": 5.278887658040465e-10, "logits/chosen": -2.011857509613037, "logits/rejected": -2.013659954071045, "logps/chosen": -3.2487313747406006, "logps/rejected": -8.00498104095459, "loss": 0.4519, "rewards/accuracies": 1.0, "rewards/chosen": 0.8989930152893066, "rewards/margins": 0.5597445964813232, "rewards/rejected": 0.339248389005661, "step": 7083 }, { "epoch": 3.82, "learning_rate": 5.247285720988259e-10, "logits/chosen": -2.0920498371124268, "logits/rejected": -2.2750184535980225, "logps/chosen": -4.574479103088379, "logps/rejected": -0.6667912006378174, "loss": 0.9105, "rewards/accuracies": 0.0, "rewards/chosen": 0.7311847805976868, "rewards/margins": -0.39587587118148804, "rewards/rejected": 1.1270606517791748, "step": 7084 }, { "epoch": 3.82, "learning_rate": 5.215778161291429e-10, "logits/chosen": -2.1688809394836426, "logits/rejected": -2.2895689010620117, "logps/chosen": -5.237070083618164, "logps/rejected": -8.488235473632812, "loss": 0.5906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9035570025444031, "rewards/margins": 0.2168407440185547, "rewards/rejected": 0.6867162585258484, "step": 7085 }, { "epoch": 3.82, "learning_rate": 5.184364984960221e-10, "logits/chosen": -2.0882818698883057, "logits/rejected": -2.2334914207458496, "logps/chosen": -7.536352157592773, "logps/rejected": -10.415719985961914, "loss": 0.5453, "rewards/accuracies": 1.0, "rewards/chosen": 0.9986039996147156, "rewards/margins": 0.32131534814834595, "rewards/rejected": 0.6772886514663696, "step": 7086 }, { "epoch": 3.82, "learning_rate": 5.153046197986899e-10, "logits/chosen": -1.981993317604065, "logits/rejected": -1.981117606163025, "logps/chosen": -0.5345140695571899, "logps/rejected": -3.1382980346679688, "loss": 0.5512, "rewards/accuracies": 1.0, "rewards/chosen": 0.9482345581054688, "rewards/margins": 0.30741631984710693, "rewards/rejected": 0.6408182382583618, "step": 7087 }, { "epoch": 3.82, "learning_rate": 5.121821806345739e-10, "logits/chosen": -2.0712852478027344, "logits/rejected": -2.0735552310943604, "logps/chosen": -1.1721768379211426, "logps/rejected": -3.2272486686706543, "loss": 0.5154, "rewards/accuracies": 1.0, "rewards/chosen": 0.8983942270278931, "rewards/margins": 0.39410245418548584, "rewards/rejected": 0.5042917728424072, "step": 7088 }, { "epoch": 3.82, "learning_rate": 5.090691815993031e-10, "logits/chosen": -2.096465826034546, "logits/rejected": -2.1205291748046875, "logps/chosen": -3.0546841621398926, "logps/rejected": -6.0779337882995605, "loss": 0.4723, "rewards/accuracies": 1.0, "rewards/chosen": 1.188880205154419, "rewards/margins": 0.5047860741615295, "rewards/rejected": 0.6840941309928894, "step": 7089 }, { "epoch": 3.82, "learning_rate": 5.059656232867027e-10, "logits/chosen": -2.1452598571777344, "logits/rejected": -2.3292481899261475, "logps/chosen": -0.6093878746032715, "logps/rejected": -0.5975359082221985, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 1.029762864112854, "rewards/margins": 0.011773943901062012, "rewards/rejected": 1.017988920211792, "step": 7090 }, { "epoch": 3.82, "learning_rate": 5.028715062887878e-10, "logits/chosen": -2.055302381515503, "logits/rejected": -2.052710771560669, "logps/chosen": -0.471149206161499, "logps/rejected": -8.344730377197266, "loss": 0.3362, "rewards/accuracies": 1.0, "rewards/chosen": 1.137435793876648, "rewards/margins": 0.917152464389801, "rewards/rejected": 0.22028331458568573, "step": 7091 }, { "epoch": 3.83, "learning_rate": 4.99786831195792e-10, "logits/chosen": -2.0864462852478027, "logits/rejected": -2.0809876918792725, "logps/chosen": -7.0398359298706055, "logps/rejected": -10.327489852905273, "loss": 0.2304, "rewards/accuracies": 1.0, "rewards/chosen": 1.6306148767471313, "rewards/margins": 1.3506553173065186, "rewards/rejected": 0.279959499835968, "step": 7092 }, { "epoch": 3.83, "learning_rate": 4.967115985961334e-10, "logits/chosen": -2.085012197494507, "logits/rejected": -1.976576566696167, "logps/chosen": -22.29607391357422, "logps/rejected": -8.706429481506348, "loss": 0.1146, "rewards/accuracies": 1.0, "rewards/chosen": 2.358154058456421, "rewards/margins": 2.1088945865631104, "rewards/rejected": 0.24925947189331055, "step": 7093 }, { "epoch": 3.83, "learning_rate": 4.936458090764372e-10, "logits/chosen": -2.1100759506225586, "logits/rejected": -2.114657163619995, "logps/chosen": -2.4898757934570312, "logps/rejected": -7.237432479858398, "loss": 0.3753, "rewards/accuracies": 1.0, "rewards/chosen": 1.6153310537338257, "rewards/margins": 0.7865194082260132, "rewards/rejected": 0.8288116455078125, "step": 7094 }, { "epoch": 3.83, "learning_rate": 4.905894632215136e-10, "logits/chosen": -2.1711294651031494, "logits/rejected": -2.3306615352630615, "logps/chosen": -0.649449348449707, "logps/rejected": -0.620586097240448, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 0.9613709449768066, "rewards/margins": -0.016282260417938232, "rewards/rejected": 0.9776532053947449, "step": 7095 }, { "epoch": 3.83, "learning_rate": 4.875425616143902e-10, "logits/chosen": -1.9810864925384521, "logits/rejected": -2.270094871520996, "logps/chosen": -0.5856778025627136, "logps/rejected": -0.6030192375183105, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 1.0606753826141357, "rewards/margins": 0.030957579612731934, "rewards/rejected": 1.0297178030014038, "step": 7096 }, { "epoch": 3.83, "learning_rate": 4.845051048362747e-10, "logits/chosen": -2.148189067840576, "logits/rejected": -2.15644907951355, "logps/chosen": -2.3466615676879883, "logps/rejected": -5.295205593109131, "loss": 0.3924, "rewards/accuracies": 1.0, "rewards/chosen": 1.1374287605285645, "rewards/margins": 0.7327388525009155, "rewards/rejected": 0.4046899378299713, "step": 7097 }, { "epoch": 3.83, "learning_rate": 4.814770934665924e-10, "logits/chosen": -2.104992151260376, "logits/rejected": -2.1120293140411377, "logps/chosen": -2.010507106781006, "logps/rejected": -4.351232051849365, "loss": 0.454, "rewards/accuracies": 1.0, "rewards/chosen": 1.0528137683868408, "rewards/margins": 0.5540581941604614, "rewards/rejected": 0.4987556040287018, "step": 7098 }, { "epoch": 3.83, "learning_rate": 4.784585280829478e-10, "logits/chosen": -2.076878547668457, "logits/rejected": -2.0794835090637207, "logps/chosen": -5.101560592651367, "logps/rejected": -4.230382442474365, "loss": 0.5343, "rewards/accuracies": 1.0, "rewards/chosen": 0.9398056268692017, "rewards/margins": 0.3478385806083679, "rewards/rejected": 0.5919670462608337, "step": 7099 }, { "epoch": 3.83, "learning_rate": 4.754494092611638e-10, "logits/chosen": -2.122103214263916, "logits/rejected": -1.9981379508972168, "logps/chosen": -27.86726188659668, "logps/rejected": -4.206643104553223, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 2.5283193588256836, "rewards/margins": 2.1131558418273926, "rewards/rejected": 0.4151636064052582, "step": 7100 }, { "epoch": 3.83, "learning_rate": 4.724497375752368e-10, "logits/chosen": -2.1802358627319336, "logits/rejected": -2.170513391494751, "logps/chosen": -0.6937621831893921, "logps/rejected": -6.019454002380371, "loss": 0.4259, "rewards/accuracies": 1.0, "rewards/chosen": 1.1186821460723877, "rewards/margins": 0.6329111456871033, "rewards/rejected": 0.4857710003852844, "step": 7101 }, { "epoch": 3.83, "learning_rate": 4.694595135973755e-10, "logits/chosen": -2.0444111824035645, "logits/rejected": -2.292341470718384, "logps/chosen": -2.1134533882141113, "logps/rejected": -2.3526039123535156, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.7808660268783569, "rewards/margins": 0.013824701309204102, "rewards/rejected": 0.7670413255691528, "step": 7102 }, { "epoch": 3.83, "learning_rate": 4.664787378979906e-10, "logits/chosen": -2.0543060302734375, "logits/rejected": -2.272946834564209, "logps/chosen": -0.35433894395828247, "logps/rejected": -0.44904035329818726, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.9609919786453247, "rewards/margins": 0.011655151844024658, "rewards/rejected": 0.9493368268013, "step": 7103 }, { "epoch": 3.83, "learning_rate": 4.6350741104568826e-10, "logits/chosen": -2.150367259979248, "logits/rejected": -2.310551404953003, "logps/chosen": -6.11409330368042, "logps/rejected": -6.274057388305664, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 1.2865995168685913, "rewards/margins": 0.03162837028503418, "rewards/rejected": 1.2549711465835571, "step": 7104 }, { "epoch": 3.83, "learning_rate": 4.605455336072539e-10, "logits/chosen": -2.1354870796203613, "logits/rejected": -2.1274209022521973, "logps/chosen": -0.5120244026184082, "logps/rejected": -14.4954833984375, "loss": 0.3395, "rewards/accuracies": 1.0, "rewards/chosen": 1.2464793920516968, "rewards/margins": 0.9056031703948975, "rewards/rejected": 0.34087619185447693, "step": 7105 }, { "epoch": 3.83, "learning_rate": 4.575931061477023e-10, "logits/chosen": -2.146723985671997, "logits/rejected": -2.136542558670044, "logps/chosen": -0.3343096077442169, "logps/rejected": -8.269883155822754, "loss": 0.3945, "rewards/accuracies": 1.0, "rewards/chosen": 1.177589774131775, "rewards/margins": 0.7264312505722046, "rewards/rejected": 0.4511585235595703, "step": 7106 }, { "epoch": 3.83, "learning_rate": 4.546501292302163e-10, "logits/chosen": -2.232109546661377, "logits/rejected": -2.3854119777679443, "logps/chosen": -0.48365989327430725, "logps/rejected": -0.5038862228393555, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 1.0407898426055908, "rewards/margins": 0.01373279094696045, "rewards/rejected": 1.0270570516586304, "step": 7107 }, { "epoch": 3.83, "learning_rate": 4.517166034161912e-10, "logits/chosen": -2.032036542892456, "logits/rejected": -2.0363292694091797, "logps/chosen": -1.8193552494049072, "logps/rejected": -8.659601211547852, "loss": 0.3459, "rewards/accuracies": 1.0, "rewards/chosen": 0.9886747598648071, "rewards/margins": 0.8836545944213867, "rewards/rejected": 0.10502014309167862, "step": 7108 }, { "epoch": 3.83, "learning_rate": 4.4879252926521837e-10, "logits/chosen": -2.037820816040039, "logits/rejected": -2.248204231262207, "logps/chosen": -8.20811939239502, "logps/rejected": -4.931790828704834, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.9378744959831238, "rewards/margins": 0.02585911750793457, "rewards/rejected": 0.9120153784751892, "step": 7109 }, { "epoch": 3.83, "learning_rate": 4.4587790733509043e-10, "logits/chosen": -2.143670082092285, "logits/rejected": -2.1311936378479004, "logps/chosen": -4.960925102233887, "logps/rejected": -9.438272476196289, "loss": 0.4573, "rewards/accuracies": 1.0, "rewards/chosen": 1.0434905290603638, "rewards/margins": 0.5451016426086426, "rewards/rejected": 0.4983888566493988, "step": 7110 }, { "epoch": 3.84, "learning_rate": 4.429727381817794e-10, "logits/chosen": -2.058776617050171, "logits/rejected": -2.2441489696502686, "logps/chosen": -2.3503470420837402, "logps/rejected": -2.474619150161743, "loss": 0.6712, "rewards/accuracies": 1.0, "rewards/chosen": 0.7051056623458862, "rewards/margins": 0.04439646005630493, "rewards/rejected": 0.6607092022895813, "step": 7111 }, { "epoch": 3.84, "learning_rate": 4.4007702235946963e-10, "logits/chosen": -2.1051158905029297, "logits/rejected": -2.111593008041382, "logps/chosen": -5.26140022277832, "logps/rejected": -11.231083869934082, "loss": 0.3859, "rewards/accuracies": 1.0, "rewards/chosen": 1.1079797744750977, "rewards/margins": 0.7529789805412292, "rewards/rejected": 0.3550007939338684, "step": 7112 }, { "epoch": 3.84, "learning_rate": 4.371907604205416e-10, "logits/chosen": -2.0758845806121826, "logits/rejected": -2.3619706630706787, "logps/chosen": -1.5703765153884888, "logps/rejected": -1.5926556587219238, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 1.0245531797409058, "rewards/margins": 0.017144441604614258, "rewards/rejected": 1.0074087381362915, "step": 7113 }, { "epoch": 3.84, "learning_rate": 4.343139529155659e-10, "logits/chosen": -2.0970773696899414, "logits/rejected": -2.0957179069519043, "logps/chosen": -4.5035223960876465, "logps/rejected": -4.046794414520264, "loss": 0.2402, "rewards/accuracies": 1.0, "rewards/chosen": 1.7511167526245117, "rewards/margins": 1.3036723136901855, "rewards/rejected": 0.44744449853897095, "step": 7114 }, { "epoch": 3.84, "learning_rate": 4.314466003933093e-10, "logits/chosen": -2.1101112365722656, "logits/rejected": -2.139665365219116, "logps/chosen": -3.0790510177612305, "logps/rejected": -6.7169976234436035, "loss": 0.3341, "rewards/accuracies": 1.0, "rewards/chosen": 1.6390098333358765, "rewards/margins": 0.9247881174087524, "rewards/rejected": 0.714221715927124, "step": 7115 }, { "epoch": 3.84, "learning_rate": 4.2858870340074517e-10, "logits/chosen": -2.072085380554199, "logits/rejected": -2.065372943878174, "logps/chosen": -2.823740005493164, "logps/rejected": -6.9720892906188965, "loss": 0.217, "rewards/accuracies": 1.0, "rewards/chosen": 1.5747184753417969, "rewards/margins": 1.417290210723877, "rewards/rejected": 0.15742821991443634, "step": 7116 }, { "epoch": 3.84, "learning_rate": 4.257402624830375e-10, "logits/chosen": -1.988466501235962, "logits/rejected": -2.294060468673706, "logps/chosen": -0.6930347084999084, "logps/rejected": -7.398386001586914, "loss": 0.523, "rewards/accuracies": 1.0, "rewards/chosen": 0.8707594275474548, "rewards/margins": 0.37533316016197205, "rewards/rejected": 0.4954262673854828, "step": 7117 }, { "epoch": 3.84, "learning_rate": 4.22901278183535e-10, "logits/chosen": -2.095881223678589, "logits/rejected": -2.096245288848877, "logps/chosen": -1.0492265224456787, "logps/rejected": -2.3825302124023438, "loss": 0.6039, "rewards/accuracies": 1.0, "rewards/chosen": 1.0557547807693481, "rewards/margins": 0.18719244003295898, "rewards/rejected": 0.8685623407363892, "step": 7118 }, { "epoch": 3.84, "learning_rate": 4.2007175104379344e-10, "logits/chosen": -2.146418809890747, "logits/rejected": -2.1570990085601807, "logps/chosen": -1.4547696113586426, "logps/rejected": -2.6133713722229004, "loss": 0.5166, "rewards/accuracies": 1.0, "rewards/chosen": 1.0338510274887085, "rewards/margins": 0.3910978436470032, "rewards/rejected": 0.6427531838417053, "step": 7119 }, { "epoch": 3.84, "learning_rate": 4.1725168160357535e-10, "logits/chosen": -2.0399794578552246, "logits/rejected": -2.3097946643829346, "logps/chosen": -0.1898847371339798, "logps/rejected": -0.2519649863243103, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.8970625996589661, "rewards/margins": 0.028056800365447998, "rewards/rejected": 0.8690057992935181, "step": 7120 }, { "epoch": 3.84, "learning_rate": 4.144410704008172e-10, "logits/chosen": -2.071669816970825, "logits/rejected": -2.2506678104400635, "logps/chosen": -3.8098862171173096, "logps/rejected": -5.782492637634277, "loss": 0.501, "rewards/accuracies": 1.0, "rewards/chosen": 0.9505776762962341, "rewards/margins": 0.4302293062210083, "rewards/rejected": 0.5203483700752258, "step": 7121 }, { "epoch": 3.84, "learning_rate": 4.116399179716679e-10, "logits/chosen": -2.0588414669036865, "logits/rejected": -2.3347277641296387, "logps/chosen": -0.136525958776474, "logps/rejected": -0.1346883773803711, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.9561290144920349, "rewards/margins": 0.006884098052978516, "rewards/rejected": 0.9492449164390564, "step": 7122 }, { "epoch": 3.84, "learning_rate": 4.0884822485046657e-10, "logits/chosen": -2.0730793476104736, "logits/rejected": -2.3281373977661133, "logps/chosen": -0.6871483325958252, "logps/rejected": -0.6488218903541565, "loss": 0.6741, "rewards/accuracies": 1.0, "rewards/chosen": 1.037973165512085, "rewards/margins": 0.03856492042541504, "rewards/rejected": 0.9994082450866699, "step": 7123 }, { "epoch": 3.84, "learning_rate": 4.060659915697373e-10, "logits/chosen": -2.0071654319763184, "logits/rejected": -2.2323248386383057, "logps/chosen": -2.009671926498413, "logps/rejected": -1.7167141437530518, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.6631854176521301, "rewards/margins": -0.013878226280212402, "rewards/rejected": 0.6770636439323425, "step": 7124 }, { "epoch": 3.84, "learning_rate": 4.0329321866022226e-10, "logits/chosen": -2.1467740535736084, "logits/rejected": -2.138481378555298, "logps/chosen": -4.239274978637695, "logps/rejected": -4.668096542358398, "loss": 0.2857, "rewards/accuracies": 1.0, "rewards/chosen": 1.473415732383728, "rewards/margins": 1.1064656972885132, "rewards/rejected": 0.36695003509521484, "step": 7125 }, { "epoch": 3.84, "learning_rate": 4.005299066508372e-10, "logits/chosen": -2.2164504528045654, "logits/rejected": -2.1945888996124268, "logps/chosen": -4.383543968200684, "logps/rejected": -9.079427719116211, "loss": 0.3969, "rewards/accuracies": 1.0, "rewards/chosen": 1.1101118326187134, "rewards/margins": 0.7191417217254639, "rewards/rejected": 0.3909701406955719, "step": 7126 }, { "epoch": 3.84, "learning_rate": 3.977760560687049e-10, "logits/chosen": -1.994543433189392, "logits/rejected": -1.994364857673645, "logps/chosen": -0.9835872650146484, "logps/rejected": -1.0541892051696777, "loss": 0.5431, "rewards/accuracies": 1.0, "rewards/chosen": 1.196689486503601, "rewards/margins": 0.32666343450546265, "rewards/rejected": 0.8700260519981384, "step": 7127 }, { "epoch": 3.84, "learning_rate": 3.950316674391441e-10, "logits/chosen": -2.146958351135254, "logits/rejected": -2.1420693397521973, "logps/chosen": -6.1705322265625, "logps/rejected": -4.096560955047607, "loss": 0.2896, "rewards/accuracies": 1.0, "rewards/chosen": 1.6553837060928345, "rewards/margins": 1.0907938480377197, "rewards/rejected": 0.5645898580551147, "step": 7128 }, { "epoch": 3.85, "learning_rate": 3.9229674128565817e-10, "logits/chosen": -2.014490842819214, "logits/rejected": -2.2931628227233887, "logps/chosen": -0.6316046118736267, "logps/rejected": -0.650670051574707, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 1.0326988697052002, "rewards/margins": 0.0003380775451660156, "rewards/rejected": 1.0323607921600342, "step": 7129 }, { "epoch": 3.85, "learning_rate": 3.8957127812995763e-10, "logits/chosen": -2.033287286758423, "logits/rejected": -2.0280869007110596, "logps/chosen": -4.621897220611572, "logps/rejected": -2.7344319820404053, "loss": 0.392, "rewards/accuracies": 1.0, "rewards/chosen": 1.4299805164337158, "rewards/margins": 0.7341333627700806, "rewards/rejected": 0.6958471536636353, "step": 7130 }, { "epoch": 3.85, "learning_rate": 3.868552784919432e-10, "logits/chosen": -2.1173315048217773, "logits/rejected": -2.353787899017334, "logps/chosen": -0.28542113304138184, "logps/rejected": -0.30867522954940796, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.8895713686943054, "rewards/margins": 0.0004667043685913086, "rewards/rejected": 0.8891046643257141, "step": 7131 }, { "epoch": 3.85, "learning_rate": 3.8414874288970613e-10, "logits/chosen": -2.1850953102111816, "logits/rejected": -2.2891039848327637, "logps/chosen": -1.502435326576233, "logps/rejected": -1.5564216375350952, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 1.0441983938217163, "rewards/margins": 0.016049861907958984, "rewards/rejected": 1.0281485319137573, "step": 7132 }, { "epoch": 3.85, "learning_rate": 3.8145167183953885e-10, "logits/chosen": -2.080177068710327, "logits/rejected": -2.4303674697875977, "logps/chosen": -5.319489479064941, "logps/rejected": -18.473798751831055, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.8280091285705566, "rewards/margins": 0.03270137310028076, "rewards/rejected": 0.7953077554702759, "step": 7133 }, { "epoch": 3.85, "learning_rate": 3.7876406585592437e-10, "logits/chosen": -2.0718917846679688, "logits/rejected": -2.075269937515259, "logps/chosen": -4.580883979797363, "logps/rejected": -0.4346521198749542, "loss": 0.5393, "rewards/accuracies": 1.0, "rewards/chosen": 1.2882102727890015, "rewards/margins": 0.3357359766960144, "rewards/rejected": 0.9524742960929871, "step": 7134 }, { "epoch": 3.85, "learning_rate": 3.7608592545153586e-10, "logits/chosen": -2.005324363708496, "logits/rejected": -2.0119240283966064, "logps/chosen": -2.353179454803467, "logps/rejected": -3.2476696968078613, "loss": 0.3995, "rewards/accuracies": 1.0, "rewards/chosen": 1.2998857498168945, "rewards/margins": 0.7110288143157959, "rewards/rejected": 0.5888569355010986, "step": 7135 }, { "epoch": 3.85, "learning_rate": 3.734172511372591e-10, "logits/chosen": -2.197122812271118, "logits/rejected": -2.3664848804473877, "logps/chosen": -5.65982723236084, "logps/rejected": -0.7701405882835388, "loss": 0.6603, "rewards/accuracies": 1.0, "rewards/chosen": 1.1601370573043823, "rewards/margins": 0.0668562650680542, "rewards/rejected": 1.0932807922363281, "step": 7136 }, { "epoch": 3.85, "learning_rate": 3.70758043422148e-10, "logits/chosen": -2.143564224243164, "logits/rejected": -2.2183101177215576, "logps/chosen": -4.73557710647583, "logps/rejected": -24.420154571533203, "loss": 0.6115, "rewards/accuracies": 1.0, "rewards/chosen": 0.9938578009605408, "rewards/margins": 0.17059040069580078, "rewards/rejected": 0.82326740026474, "step": 7137 }, { "epoch": 3.85, "learning_rate": 3.681083028134746e-10, "logits/chosen": -2.0109496116638184, "logits/rejected": -2.2845818996429443, "logps/chosen": -0.5269367098808289, "logps/rejected": -0.7385258674621582, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.803226888179779, "rewards/margins": 0.02608412504196167, "rewards/rejected": 0.7771427631378174, "step": 7138 }, { "epoch": 3.85, "learning_rate": 3.6546802981668457e-10, "logits/chosen": -2.1145870685577393, "logits/rejected": -2.268770694732666, "logps/chosen": -2.183241605758667, "logps/rejected": -6.8198065757751465, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 0.7208318114280701, "rewards/margins": 0.27580544352531433, "rewards/rejected": 0.44502636790275574, "step": 7139 }, { "epoch": 3.85, "learning_rate": 3.6283722493543613e-10, "logits/chosen": -1.9656609296798706, "logits/rejected": -2.284428358078003, "logps/chosen": -1.193951964378357, "logps/rejected": -1.2063909769058228, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9819144606590271, "rewards/margins": 0.007779717445373535, "rewards/rejected": 0.9741347432136536, "step": 7140 }, { "epoch": 3.85, "learning_rate": 3.602158886715667e-10, "logits/chosen": -2.130239963531494, "logits/rejected": -2.2435946464538574, "logps/chosen": -2.190260887145996, "logps/rejected": -2.1199851036071777, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.9564122557640076, "rewards/margins": 0.0120011568069458, "rewards/rejected": 0.9444110989570618, "step": 7141 }, { "epoch": 3.85, "learning_rate": 3.576040215251208e-10, "logits/chosen": -2.1441898345947266, "logits/rejected": -2.3785054683685303, "logps/chosen": -0.819513738155365, "logps/rejected": -0.828482985496521, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 1.0270671844482422, "rewards/margins": 0.016674518585205078, "rewards/rejected": 1.010392665863037, "step": 7142 }, { "epoch": 3.85, "learning_rate": 3.550016239943221e-10, "logits/chosen": -2.0441524982452393, "logits/rejected": -2.040442943572998, "logps/chosen": -1.1273066997528076, "logps/rejected": -9.790465354919434, "loss": 0.3223, "rewards/accuracies": 1.0, "rewards/chosen": 1.0204603672027588, "rewards/margins": 0.9667085409164429, "rewards/rejected": 0.05375185236334801, "step": 7143 }, { "epoch": 3.85, "learning_rate": 3.524086965755957e-10, "logits/chosen": -2.058683395385742, "logits/rejected": -2.2586669921875, "logps/chosen": -0.1857401579618454, "logps/rejected": -0.2037534862756729, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.842640221118927, "rewards/margins": 0.01603710651397705, "rewards/rejected": 0.82660311460495, "step": 7144 }, { "epoch": 3.85, "learning_rate": 3.498252397635626e-10, "logits/chosen": -2.079413414001465, "logits/rejected": -2.2627367973327637, "logps/chosen": -0.3363291919231415, "logps/rejected": -0.30302783846855164, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.875298798084259, "rewards/margins": 0.011746108531951904, "rewards/rejected": 0.8635526895523071, "step": 7145 }, { "epoch": 3.85, "learning_rate": 3.472512540510342e-10, "logits/chosen": -2.2045154571533203, "logits/rejected": -2.298462152481079, "logps/chosen": -1.3591159582138062, "logps/rejected": -1.388180136680603, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9887446761131287, "rewards/margins": 0.015095233917236328, "rewards/rejected": 0.9736494421958923, "step": 7146 }, { "epoch": 3.85, "learning_rate": 3.4468673992901765e-10, "logits/chosen": -2.075561761856079, "logits/rejected": -2.364581346511841, "logps/chosen": -6.200873851776123, "logps/rejected": -7.389508247375488, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 1.1439119577407837, "rewards/margins": -0.014383792877197266, "rewards/rejected": 1.158295750617981, "step": 7147 }, { "epoch": 3.86, "learning_rate": 3.421316978867106e-10, "logits/chosen": -2.008014678955078, "logits/rejected": -1.940224289894104, "logps/chosen": -13.145285606384277, "logps/rejected": -1.847281575202942, "loss": 0.2698, "rewards/accuracies": 1.0, "rewards/chosen": 2.044837236404419, "rewards/margins": 1.1720201969146729, "rewards/rejected": 0.8728169798851013, "step": 7148 }, { "epoch": 3.86, "learning_rate": 3.3958612841149537e-10, "logits/chosen": -2.028451442718506, "logits/rejected": -2.0227434635162354, "logps/chosen": -1.5074551105499268, "logps/rejected": -4.727617263793945, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 1.218849539756775, "rewards/margins": 0.6526849269866943, "rewards/rejected": 0.5661646127700806, "step": 7149 }, { "epoch": 3.86, "learning_rate": 3.3705003198896133e-10, "logits/chosen": -2.172266721725464, "logits/rejected": -2.165576696395874, "logps/chosen": -3.5411183834075928, "logps/rejected": -9.188478469848633, "loss": 0.2443, "rewards/accuracies": 1.0, "rewards/chosen": 1.4561424255371094, "rewards/margins": 1.2846601009368896, "rewards/rejected": 0.1714823693037033, "step": 7150 }, { "epoch": 3.86, "learning_rate": 3.3452340910288813e-10, "logits/chosen": -2.1517927646636963, "logits/rejected": -2.065500259399414, "logps/chosen": -11.838587760925293, "logps/rejected": -4.150600433349609, "loss": 0.3061, "rewards/accuracies": 1.0, "rewards/chosen": 1.778342843055725, "rewards/margins": 1.0268914699554443, "rewards/rejected": 0.751451313495636, "step": 7151 }, { "epoch": 3.86, "learning_rate": 3.320062602352458e-10, "logits/chosen": -2.1760365962982178, "logits/rejected": -2.3063204288482666, "logps/chosen": -1.9484373331069946, "logps/rejected": -2.0117878913879395, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.9848968386650085, "rewards/margins": 0.0038036108016967773, "rewards/rejected": 0.9810932278633118, "step": 7152 }, { "epoch": 3.86, "learning_rate": 3.2949858586619474e-10, "logits/chosen": -2.231053113937378, "logits/rejected": -2.216233015060425, "logps/chosen": -10.460609436035156, "logps/rejected": -6.221425533294678, "loss": 0.5289, "rewards/accuracies": 1.0, "rewards/chosen": 1.2533825635910034, "rewards/margins": 0.3609907627105713, "rewards/rejected": 0.8923918008804321, "step": 7153 }, { "epoch": 3.86, "learning_rate": 3.2700038647409113e-10, "logits/chosen": -2.0079660415649414, "logits/rejected": -2.32743239402771, "logps/chosen": -7.4702229499816895, "logps/rejected": -5.4930925369262695, "loss": 0.7389, "rewards/accuracies": 0.0, "rewards/chosen": 1.0938106775283813, "rewards/margins": -0.08944535255432129, "rewards/rejected": 1.1832560300827026, "step": 7154 }, { "epoch": 3.86, "learning_rate": 3.2451166253548156e-10, "logits/chosen": -2.172800064086914, "logits/rejected": -2.327766180038452, "logps/chosen": -0.4197663366794586, "logps/rejected": -0.4725591540336609, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 1.1225398778915405, "rewards/margins": 0.006270766258239746, "rewards/rejected": 1.1162691116333008, "step": 7155 }, { "epoch": 3.86, "learning_rate": 3.220324145251141e-10, "logits/chosen": -2.159642457962036, "logits/rejected": -2.1426544189453125, "logps/chosen": -1.2404749393463135, "logps/rejected": -8.8939790725708, "loss": 0.3255, "rewards/accuracies": 1.0, "rewards/chosen": 1.207593321800232, "rewards/margins": 0.9550991058349609, "rewards/rejected": 0.2524942457675934, "step": 7156 }, { "epoch": 3.86, "learning_rate": 3.195626429159104e-10, "logits/chosen": -2.1815414428710938, "logits/rejected": -2.298069477081299, "logps/chosen": -1.3027760982513428, "logps/rejected": -1.913482427597046, "loss": 0.6591, "rewards/accuracies": 1.0, "rewards/chosen": 1.0719749927520752, "rewards/margins": 0.06939089298248291, "rewards/rejected": 1.0025840997695923, "step": 7157 }, { "epoch": 3.86, "learning_rate": 3.1710234817900474e-10, "logits/chosen": -1.959067940711975, "logits/rejected": -1.951894760131836, "logps/chosen": -2.7843289375305176, "logps/rejected": -7.164011001586914, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 0.851283848285675, "rewards/margins": 0.5634479522705078, "rewards/rejected": 0.28783589601516724, "step": 7158 }, { "epoch": 3.86, "learning_rate": 3.1465153078371055e-10, "logits/chosen": -2.0961287021636963, "logits/rejected": -2.0954318046569824, "logps/chosen": -0.6191606521606445, "logps/rejected": -3.0052943229675293, "loss": 0.573, "rewards/accuracies": 1.0, "rewards/chosen": 1.1413975954055786, "rewards/margins": 0.25680476427078247, "rewards/rejected": 0.8845928311347961, "step": 7159 }, { "epoch": 3.86, "learning_rate": 3.122101911975372e-10, "logits/chosen": -2.042548894882202, "logits/rejected": -2.2743303775787354, "logps/chosen": -0.4469054341316223, "logps/rejected": -5.131623268127441, "loss": 0.5115, "rewards/accuracies": 1.0, "rewards/chosen": 1.1478157043457031, "rewards/margins": 0.40387821197509766, "rewards/rejected": 0.7439374923706055, "step": 7160 }, { "epoch": 3.86, "learning_rate": 3.0977832988618444e-10, "logits/chosen": -2.2232134342193604, "logits/rejected": -2.2264740467071533, "logps/chosen": -0.6799437999725342, "logps/rejected": -4.505736351013184, "loss": 0.481, "rewards/accuracies": 1.0, "rewards/chosen": 1.037598967552185, "rewards/margins": 0.481825590133667, "rewards/rejected": 0.5557733774185181, "step": 7161 }, { "epoch": 3.86, "learning_rate": 3.073559473135534e-10, "logits/chosen": -2.0727241039276123, "logits/rejected": -2.0658724308013916, "logps/chosen": -13.61944580078125, "logps/rejected": -10.20746898651123, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": 1.5158637762069702, "rewards/margins": 1.3454558849334717, "rewards/rejected": 0.17040787637233734, "step": 7162 }, { "epoch": 3.86, "learning_rate": 3.049430439417244e-10, "logits/chosen": -2.180115222930908, "logits/rejected": -2.1855437755584717, "logps/chosen": -2.463758707046509, "logps/rejected": -4.873543739318848, "loss": 0.3685, "rewards/accuracies": 1.0, "rewards/chosen": 1.2460744380950928, "rewards/margins": 0.8085310459136963, "rewards/rejected": 0.4375433921813965, "step": 7163 }, { "epoch": 3.86, "learning_rate": 3.025396202309738e-10, "logits/chosen": -2.090872049331665, "logits/rejected": -1.9872132539749146, "logps/chosen": -19.40962028503418, "logps/rejected": -2.8208746910095215, "loss": 0.1105, "rewards/accuracies": 1.0, "rewards/chosen": 2.61997127532959, "rewards/margins": 2.1474108695983887, "rewards/rejected": 0.4725603759288788, "step": 7164 }, { "epoch": 3.86, "learning_rate": 3.0014567663977365e-10, "logits/chosen": -2.2154924869537354, "logits/rejected": -2.244398593902588, "logps/chosen": -0.5002664923667908, "logps/rejected": -10.081165313720703, "loss": 0.49, "rewards/accuracies": 1.0, "rewards/chosen": 0.955165684223175, "rewards/margins": 0.4583854675292969, "rewards/rejected": 0.4967802166938782, "step": 7165 }, { "epoch": 3.87, "learning_rate": 2.977612136247809e-10, "logits/chosen": -2.0408060550689697, "logits/rejected": -2.038355588912964, "logps/chosen": -0.6781864166259766, "logps/rejected": -4.621775150299072, "loss": 0.4803, "rewards/accuracies": 1.0, "rewards/chosen": 0.962404727935791, "rewards/margins": 0.48347964882850647, "rewards/rejected": 0.47892507910728455, "step": 7166 }, { "epoch": 3.87, "learning_rate": 2.9538623164085394e-10, "logits/chosen": -2.0545923709869385, "logits/rejected": -2.260934829711914, "logps/chosen": -0.38422685861587524, "logps/rejected": -0.4799971878528595, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 0.8312262892723083, "rewards/margins": -0.011954545974731445, "rewards/rejected": 0.8431808352470398, "step": 7167 }, { "epoch": 3.87, "learning_rate": 2.9302073114103044e-10, "logits/chosen": -2.058426856994629, "logits/rejected": -2.0584895610809326, "logps/chosen": -5.175625324249268, "logps/rejected": -3.007319450378418, "loss": 0.2831, "rewards/accuracies": 1.0, "rewards/chosen": 1.673987627029419, "rewards/margins": 1.116997241973877, "rewards/rejected": 0.5569903254508972, "step": 7168 }, { "epoch": 3.87, "learning_rate": 2.906647125765438e-10, "logits/chosen": -2.1651175022125244, "logits/rejected": -2.1547319889068604, "logps/chosen": -8.185896873474121, "logps/rejected": -3.349059581756592, "loss": 0.6515, "rewards/accuracies": 1.0, "rewards/chosen": 0.6505865454673767, "rewards/margins": 0.085155189037323, "rewards/rejected": 0.5654313564300537, "step": 7169 }, { "epoch": 3.87, "learning_rate": 2.8831817639682343e-10, "logits/chosen": -2.0734610557556152, "logits/rejected": -2.065110921859741, "logps/chosen": -2.568773031234741, "logps/rejected": -5.173392295837402, "loss": 0.4631, "rewards/accuracies": 1.0, "rewards/chosen": 1.0387881994247437, "rewards/margins": 0.5292351245880127, "rewards/rejected": 0.509553074836731, "step": 7170 }, { "epoch": 3.87, "learning_rate": 2.8598112304948907e-10, "logits/chosen": -2.0009140968322754, "logits/rejected": -2.0008492469787598, "logps/chosen": -0.4925820827484131, "logps/rejected": -2.448367118835449, "loss": 0.5118, "rewards/accuracies": 1.0, "rewards/chosen": 1.026843547821045, "rewards/margins": 0.4029102325439453, "rewards/rejected": 0.6239333152770996, "step": 7171 }, { "epoch": 3.87, "learning_rate": 2.8365355298034523e-10, "logits/chosen": -2.0890421867370605, "logits/rejected": -2.2828800678253174, "logps/chosen": -0.13116775453090668, "logps/rejected": -0.17239293456077576, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8828482031822205, "rewards/margins": 0.019189834594726562, "rewards/rejected": 0.8636583685874939, "step": 7172 }, { "epoch": 3.87, "learning_rate": 2.8133546663339224e-10, "logits/chosen": -2.174569606781006, "logits/rejected": -2.1402666568756104, "logps/chosen": -13.197818756103516, "logps/rejected": -6.1277689933776855, "loss": 0.3343, "rewards/accuracies": 1.0, "rewards/chosen": 1.7745636701583862, "rewards/margins": 0.9238827228546143, "rewards/rejected": 0.850680947303772, "step": 7173 }, { "epoch": 3.87, "learning_rate": 2.790268644508265e-10, "logits/chosen": -2.0826218128204346, "logits/rejected": -2.279198408126831, "logps/chosen": -0.4293704032897949, "logps/rejected": -0.4339240491390228, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 1.1808366775512695, "rewards/margins": -0.003997683525085449, "rewards/rejected": 1.184834361076355, "step": 7174 }, { "epoch": 3.87, "learning_rate": 2.767277468730123e-10, "logits/chosen": -2.1111273765563965, "logits/rejected": -2.2772982120513916, "logps/chosen": -3.1804866790771484, "logps/rejected": -1.566666841506958, "loss": 0.7275, "rewards/accuracies": 0.0, "rewards/chosen": 0.9521476626396179, "rewards/margins": -0.06748002767562866, "rewards/rejected": 1.0196276903152466, "step": 7175 }, { "epoch": 3.87, "learning_rate": 2.7443811433854323e-10, "logits/chosen": -2.2729649543762207, "logits/rejected": -2.297755479812622, "logps/chosen": -0.7329203486442566, "logps/rejected": -0.6143339276313782, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.9308615922927856, "rewards/margins": 0.0036981701850891113, "rewards/rejected": 0.9271634221076965, "step": 7176 }, { "epoch": 3.87, "learning_rate": 2.721579672841645e-10, "logits/chosen": -1.9678006172180176, "logits/rejected": -1.9643839597702026, "logps/chosen": -4.265043258666992, "logps/rejected": -5.87144660949707, "loss": 0.2919, "rewards/accuracies": 1.0, "rewards/chosen": 1.4705766439437866, "rewards/margins": 1.0818437337875366, "rewards/rejected": 0.38873291015625, "step": 7177 }, { "epoch": 3.87, "learning_rate": 2.6988730614483924e-10, "logits/chosen": -2.060373067855835, "logits/rejected": -2.045881748199463, "logps/chosen": -1.6337579488754272, "logps/rejected": -5.228241443634033, "loss": 0.439, "rewards/accuracies": 1.0, "rewards/chosen": 1.4365408420562744, "rewards/margins": 0.5956282019615173, "rewards/rejected": 0.8409126400947571, "step": 7178 }, { "epoch": 3.87, "learning_rate": 2.6762613135370983e-10, "logits/chosen": -2.009645938873291, "logits/rejected": -2.0161073207855225, "logps/chosen": -1.5570383071899414, "logps/rejected": -3.7012994289398193, "loss": 0.4147, "rewards/accuracies": 1.0, "rewards/chosen": 1.2007050514221191, "rewards/margins": 0.6656619906425476, "rewards/rejected": 0.5350430607795715, "step": 7179 }, { "epoch": 3.87, "learning_rate": 2.6537444334210344e-10, "logits/chosen": -2.1899518966674805, "logits/rejected": -2.194131374359131, "logps/chosen": -0.11746010184288025, "logps/rejected": -5.37483024597168, "loss": 0.43, "rewards/accuracies": 1.0, "rewards/chosen": 0.905523955821991, "rewards/margins": 0.6211868524551392, "rewards/rejected": 0.2843371331691742, "step": 7180 }, { "epoch": 3.87, "learning_rate": 2.6313224253955434e-10, "logits/chosen": -2.034302234649658, "logits/rejected": -2.0357754230499268, "logps/chosen": -1.8795628547668457, "logps/rejected": -5.167143821716309, "loss": 0.2392, "rewards/accuracies": 1.0, "rewards/chosen": 1.7229022979736328, "rewards/margins": 1.3082753419876099, "rewards/rejected": 0.41462698578834534, "step": 7181 }, { "epoch": 3.87, "learning_rate": 2.608995293737704e-10, "logits/chosen": -2.0913639068603516, "logits/rejected": -2.273690938949585, "logps/chosen": -0.8436971306800842, "logps/rejected": -0.8004404902458191, "loss": 0.7379, "rewards/accuracies": 0.0, "rewards/chosen": 0.8295693397521973, "rewards/margins": -0.08750277757644653, "rewards/rejected": 0.9170721173286438, "step": 7182 }, { "epoch": 3.87, "learning_rate": 2.586763042706552e-10, "logits/chosen": -2.089909315109253, "logits/rejected": -2.080709934234619, "logps/chosen": -4.677562236785889, "logps/rejected": -0.9068135619163513, "loss": 0.4544, "rewards/accuracies": 1.0, "rewards/chosen": 1.5555305480957031, "rewards/margins": 0.5529987812042236, "rewards/rejected": 1.0025317668914795, "step": 7183 }, { "epoch": 3.87, "learning_rate": 2.564625676543142e-10, "logits/chosen": -2.061842203140259, "logits/rejected": -2.1828155517578125, "logps/chosen": -0.3431408703327179, "logps/rejected": -15.803793907165527, "loss": 0.5407, "rewards/accuracies": 1.0, "rewards/chosen": 0.9906836748123169, "rewards/margins": 0.33237171173095703, "rewards/rejected": 0.6583119630813599, "step": 7184 }, { "epoch": 3.88, "learning_rate": 2.542583199470205e-10, "logits/chosen": -2.06391978263855, "logits/rejected": -2.270414113998413, "logps/chosen": -0.27227675914764404, "logps/rejected": -0.32984766364097595, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.814337432384491, "rewards/margins": 0.025451600551605225, "rewards/rejected": 0.7888858318328857, "step": 7185 }, { "epoch": 3.88, "learning_rate": 2.5206356156924904e-10, "logits/chosen": -2.069446325302124, "logits/rejected": -2.062795639038086, "logps/chosen": -7.844536781311035, "logps/rejected": -8.408649444580078, "loss": 0.4077, "rewards/accuracies": 1.0, "rewards/chosen": 1.1102298498153687, "rewards/margins": 0.6863135099411011, "rewards/rejected": 0.4239163398742676, "step": 7186 }, { "epoch": 3.88, "learning_rate": 2.4987829293967034e-10, "logits/chosen": -2.0502893924713135, "logits/rejected": -2.043201446533203, "logps/chosen": -5.204924583435059, "logps/rejected": -2.3755509853363037, "loss": 0.477, "rewards/accuracies": 1.0, "rewards/chosen": 1.2559914588928223, "rewards/margins": 0.49224191904067993, "rewards/rejected": 0.7637495398521423, "step": 7187 }, { "epoch": 3.88, "learning_rate": 2.477025144751399e-10, "logits/chosen": -2.198970079421997, "logits/rejected": -2.2017831802368164, "logps/chosen": -1.3262165784835815, "logps/rejected": -0.955827534198761, "loss": 0.6066, "rewards/accuracies": 1.0, "rewards/chosen": 1.2392029762268066, "rewards/margins": 0.18125736713409424, "rewards/rejected": 1.0579456090927124, "step": 7188 }, { "epoch": 3.88, "learning_rate": 2.455362265907035e-10, "logits/chosen": -2.067445755004883, "logits/rejected": -2.269883155822754, "logps/chosen": -0.6876774430274963, "logps/rejected": -0.7141457796096802, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.727325975894928, "rewards/margins": 0.007835209369659424, "rewards/rejected": 0.7194907665252686, "step": 7189 }, { "epoch": 3.88, "learning_rate": 2.4337942969958613e-10, "logits/chosen": -2.0805106163024902, "logits/rejected": -2.265933036804199, "logps/chosen": -5.263813018798828, "logps/rejected": -2.7507052421569824, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.9719921350479126, "rewards/margins": 0.03029632568359375, "rewards/rejected": 0.9416958093643188, "step": 7190 }, { "epoch": 3.88, "learning_rate": 2.4123212421321424e-10, "logits/chosen": -2.1769134998321533, "logits/rejected": -2.2918548583984375, "logps/chosen": -4.445605278015137, "logps/rejected": -0.4250929355621338, "loss": 0.7023, "rewards/accuracies": 0.0, "rewards/chosen": 0.9659603238105774, "rewards/margins": -0.01822221279144287, "rewards/rejected": 0.9841825366020203, "step": 7191 }, { "epoch": 3.88, "learning_rate": 2.3909431054120465e-10, "logits/chosen": -2.141977071762085, "logits/rejected": -2.2851216793060303, "logps/chosen": -2.2176945209503174, "logps/rejected": -2.2269043922424316, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 1.0056012868881226, "rewards/margins": 0.031419336795806885, "rewards/rejected": 0.9741819500923157, "step": 7192 }, { "epoch": 3.88, "learning_rate": 2.3696598909135335e-10, "logits/chosen": -2.152442693710327, "logits/rejected": -2.1609485149383545, "logps/chosen": -1.863091230392456, "logps/rejected": -3.4710206985473633, "loss": 0.4218, "rewards/accuracies": 1.0, "rewards/chosen": 1.328997015953064, "rewards/margins": 0.6449458003044128, "rewards/rejected": 0.6840512156486511, "step": 7193 }, { "epoch": 3.88, "learning_rate": 2.3484716026965776e-10, "logits/chosen": -2.0763020515441895, "logits/rejected": -2.1662087440490723, "logps/chosen": -2.6065256595611572, "logps/rejected": -18.920650482177734, "loss": 0.4019, "rewards/accuracies": 1.0, "rewards/chosen": 1.3620750904083252, "rewards/margins": 0.7039169669151306, "rewards/rejected": 0.6581581234931946, "step": 7194 }, { "epoch": 3.88, "learning_rate": 2.3273782448030023e-10, "logits/chosen": -2.3062570095062256, "logits/rejected": -2.440182685852051, "logps/chosen": -7.021521091461182, "logps/rejected": -12.601926803588867, "loss": 0.5858, "rewards/accuracies": 1.0, "rewards/chosen": 1.2777022123336792, "rewards/margins": 0.2276238203048706, "rewards/rejected": 1.0500783920288086, "step": 7195 }, { "epoch": 3.88, "learning_rate": 2.3063798212564212e-10, "logits/chosen": -2.0850470066070557, "logits/rejected": -2.0886154174804688, "logps/chosen": -0.504864513874054, "logps/rejected": -3.935222864151001, "loss": 0.4528, "rewards/accuracies": 1.0, "rewards/chosen": 0.965120792388916, "rewards/margins": 0.5574455261230469, "rewards/rejected": 0.40767526626586914, "step": 7196 }, { "epoch": 3.88, "learning_rate": 2.2854763360624085e-10, "logits/chosen": -2.132338285446167, "logits/rejected": -2.285404682159424, "logps/chosen": -0.9551661610603333, "logps/rejected": -1.1202001571655273, "loss": 0.6696, "rewards/accuracies": 1.0, "rewards/chosen": 0.9228828549385071, "rewards/margins": 0.04771071672439575, "rewards/rejected": 0.8751721382141113, "step": 7197 }, { "epoch": 3.88, "learning_rate": 2.2646677932085522e-10, "logits/chosen": -1.9809422492980957, "logits/rejected": -2.2736549377441406, "logps/chosen": -2.165559768676758, "logps/rejected": -2.059840679168701, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.7156373858451843, "rewards/margins": 0.03522235155105591, "rewards/rejected": 0.6804150342941284, "step": 7198 }, { "epoch": 3.88, "learning_rate": 2.2439541966641772e-10, "logits/chosen": -2.1822805404663086, "logits/rejected": -2.149775505065918, "logps/chosen": -20.095783233642578, "logps/rejected": -5.304768085479736, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": 1.8739837408065796, "rewards/margins": 1.4342305660247803, "rewards/rejected": 0.43975311517715454, "step": 7199 }, { "epoch": 3.88, "learning_rate": 2.2233355503805117e-10, "logits/chosen": -2.07789945602417, "logits/rejected": -2.3666231632232666, "logps/chosen": -1.9689583778381348, "logps/rejected": -1.9771064519882202, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 1.0049699544906616, "rewards/margins": 0.005891203880310059, "rewards/rejected": 0.9990787506103516, "step": 7200 }, { "epoch": 3.88, "learning_rate": 2.2028118582906875e-10, "logits/chosen": -2.1095120906829834, "logits/rejected": -2.3425545692443848, "logps/chosen": -0.9432586431503296, "logps/rejected": -0.9481560587882996, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.8134672045707703, "rewards/margins": 0.019405782222747803, "rewards/rejected": 0.7940614223480225, "step": 7201 }, { "epoch": 3.88, "learning_rate": 2.1823831243097945e-10, "logits/chosen": -2.0821194648742676, "logits/rejected": -2.0718984603881836, "logps/chosen": -5.30555534362793, "logps/rejected": -5.6963324546813965, "loss": 0.3666, "rewards/accuracies": 1.0, "rewards/chosen": 1.2353180646896362, "rewards/margins": 0.8144348859786987, "rewards/rejected": 0.4208831489086151, "step": 7202 }, { "epoch": 3.89, "learning_rate": 2.162049352334716e-10, "logits/chosen": -2.1556193828582764, "logits/rejected": -2.2812209129333496, "logps/chosen": -1.9319276809692383, "logps/rejected": -1.769951581954956, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 0.9600753784179688, "rewards/margins": -0.011948049068450928, "rewards/rejected": 0.9720234274864197, "step": 7203 }, { "epoch": 3.89, "learning_rate": 2.141810546244238e-10, "logits/chosen": -2.118818521499634, "logits/rejected": -2.103048801422119, "logps/chosen": -12.516727447509766, "logps/rejected": -5.832356929779053, "loss": 0.2081, "rewards/accuracies": 1.0, "rewards/chosen": 1.9148199558258057, "rewards/margins": 1.4638186693191528, "rewards/rejected": 0.4510013163089752, "step": 7204 }, { "epoch": 3.89, "learning_rate": 2.1216667098990504e-10, "logits/chosen": -2.2270350456237793, "logits/rejected": -2.223170518875122, "logps/chosen": -5.849445343017578, "logps/rejected": -4.381643295288086, "loss": 0.4462, "rewards/accuracies": 1.0, "rewards/chosen": 0.9864988327026367, "rewards/margins": 0.5757293701171875, "rewards/rejected": 0.4107694625854492, "step": 7205 }, { "epoch": 3.89, "learning_rate": 2.1016178471417455e-10, "logits/chosen": -2.067970037460327, "logits/rejected": -2.339716911315918, "logps/chosen": -2.6184792518615723, "logps/rejected": -2.679762125015259, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.7235127091407776, "rewards/margins": 0.02922302484512329, "rewards/rejected": 0.6942896842956543, "step": 7206 }, { "epoch": 3.89, "learning_rate": 2.0816639617967645e-10, "logits/chosen": -1.988317608833313, "logits/rejected": -2.2633955478668213, "logps/chosen": -0.2530129849910736, "logps/rejected": -0.23496396839618683, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.7917191982269287, "rewards/margins": -4.45246696472168e-05, "rewards/rejected": 0.7917637228965759, "step": 7207 }, { "epoch": 3.89, "learning_rate": 2.0618050576704516e-10, "logits/chosen": -2.0405569076538086, "logits/rejected": -2.0418195724487305, "logps/chosen": -3.9227240085601807, "logps/rejected": -2.607759714126587, "loss": 0.3055, "rewards/accuracies": 1.0, "rewards/chosen": 1.5925827026367188, "rewards/margins": 1.029043436050415, "rewards/rejected": 0.5635392069816589, "step": 7208 }, { "epoch": 3.89, "learning_rate": 2.042041138551054e-10, "logits/chosen": -2.164083480834961, "logits/rejected": -2.175365924835205, "logps/chosen": -2.595177173614502, "logps/rejected": -5.132896423339844, "loss": 0.4384, "rewards/accuracies": 1.0, "rewards/chosen": 1.0056383609771729, "rewards/margins": 0.5973648428916931, "rewards/rejected": 0.40827351808547974, "step": 7209 }, { "epoch": 3.89, "learning_rate": 2.022372208208556e-10, "logits/chosen": -1.978536605834961, "logits/rejected": -2.3091752529144287, "logps/chosen": -3.1093711853027344, "logps/rejected": -1.5092802047729492, "loss": 0.7903, "rewards/accuracies": 0.0, "rewards/chosen": 0.6223184466362, "rewards/margins": -0.18569231033325195, "rewards/rejected": 0.8080107569694519, "step": 7210 }, { "epoch": 3.89, "learning_rate": 2.0027982703950676e-10, "logits/chosen": -2.1216232776641846, "logits/rejected": -2.296353340148926, "logps/chosen": -0.22787770628929138, "logps/rejected": -0.24335594475269318, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.7786765098571777, "rewards/margins": 0.027501165866851807, "rewards/rejected": 0.7511753439903259, "step": 7211 }, { "epoch": 3.89, "learning_rate": 1.9833193288443796e-10, "logits/chosen": -2.1665258407592773, "logits/rejected": -2.318895101547241, "logps/chosen": -0.28657135367393494, "logps/rejected": -0.3151977062225342, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9606102108955383, "rewards/margins": 0.01693934202194214, "rewards/rejected": 0.9436708688735962, "step": 7212 }, { "epoch": 3.89, "learning_rate": 1.963935387272242e-10, "logits/chosen": -2.1092894077301025, "logits/rejected": -2.1154003143310547, "logps/chosen": -0.856825590133667, "logps/rejected": -3.2127108573913574, "loss": 0.4455, "rewards/accuracies": 1.0, "rewards/chosen": 1.1435636281967163, "rewards/margins": 0.5775224566459656, "rewards/rejected": 0.5660411715507507, "step": 7213 }, { "epoch": 3.89, "learning_rate": 1.9446464493762526e-10, "logits/chosen": -2.1411917209625244, "logits/rejected": -2.271639823913574, "logps/chosen": -0.09680087864398956, "logps/rejected": -0.09330016374588013, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.8350709080696106, "rewards/margins": 0.0031533241271972656, "rewards/rejected": 0.8319175839424133, "step": 7214 }, { "epoch": 3.89, "learning_rate": 1.9254525188359681e-10, "logits/chosen": -2.141439199447632, "logits/rejected": -2.313791513442993, "logps/chosen": -1.0719469785690308, "logps/rejected": -3.4762096405029297, "loss": 0.5687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9483316540718079, "rewards/margins": 0.2666977047920227, "rewards/rejected": 0.6816339492797852, "step": 7215 }, { "epoch": 3.89, "learning_rate": 1.9063535993126823e-10, "logits/chosen": -2.068340539932251, "logits/rejected": -2.2927932739257812, "logps/chosen": -0.22987857460975647, "logps/rejected": -0.2516748011112213, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.7998409271240234, "rewards/margins": 0.01706486940383911, "rewards/rejected": 0.7827760577201843, "step": 7216 }, { "epoch": 3.89, "learning_rate": 1.887349694449647e-10, "logits/chosen": -2.11545991897583, "logits/rejected": -2.1189253330230713, "logps/chosen": -3.477804660797119, "logps/rejected": -0.8367401361465454, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.7302073240280151, "rewards/margins": 0.033569276332855225, "rewards/rejected": 0.6966380476951599, "step": 7217 }, { "epoch": 3.89, "learning_rate": 1.8684408078720737e-10, "logits/chosen": -2.1189584732055664, "logits/rejected": -2.309868097305298, "logps/chosen": -2.0937514305114746, "logps/rejected": -1.9785822629928589, "loss": 0.7074, "rewards/accuracies": 0.0, "rewards/chosen": 1.1133161783218384, "rewards/margins": -0.028268098831176758, "rewards/rejected": 1.1415842771530151, "step": 7218 }, { "epoch": 3.89, "learning_rate": 1.8496269431867996e-10, "logits/chosen": -2.1609508991241455, "logits/rejected": -2.3270561695098877, "logps/chosen": -5.84434175491333, "logps/rejected": -9.734131813049316, "loss": 0.6745, "rewards/accuracies": 1.0, "rewards/chosen": 1.0839074850082397, "rewards/margins": 0.03774440288543701, "rewards/rejected": 1.0461630821228027, "step": 7219 }, { "epoch": 3.89, "learning_rate": 1.8309081039828423e-10, "logits/chosen": -2.151550531387329, "logits/rejected": -2.1487185955047607, "logps/chosen": -7.235013484954834, "logps/rejected": -3.7505617141723633, "loss": 0.316, "rewards/accuracies": 1.0, "rewards/chosen": 1.5536978244781494, "rewards/margins": 0.9897563457489014, "rewards/rejected": 0.563941478729248, "step": 7220 }, { "epoch": 3.89, "learning_rate": 1.8122842938308458e-10, "logits/chosen": -1.9992468357086182, "logits/rejected": -2.3202621936798096, "logps/chosen": -0.38031381368637085, "logps/rejected": -0.5163347721099854, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": 0.8385739326477051, "rewards/margins": 0.053974032402038574, "rewards/rejected": 0.7845999002456665, "step": 7221 }, { "epoch": 3.9, "learning_rate": 1.7937555162835237e-10, "logits/chosen": -2.1561472415924072, "logits/rejected": -2.1490211486816406, "logps/chosen": -2.2167961597442627, "logps/rejected": -4.7113752365112305, "loss": 0.4637, "rewards/accuracies": 1.0, "rewards/chosen": 1.117706060409546, "rewards/margins": 0.5276787877082825, "rewards/rejected": 0.5900272727012634, "step": 7222 }, { "epoch": 3.9, "learning_rate": 1.7753217748752714e-10, "logits/chosen": -2.2152106761932373, "logits/rejected": -2.2145612239837646, "logps/chosen": -1.4439085721969604, "logps/rejected": -7.319680690765381, "loss": 0.3376, "rewards/accuracies": 1.0, "rewards/chosen": 1.1277289390563965, "rewards/margins": 0.9122839570045471, "rewards/rejected": 0.21544499695301056, "step": 7223 }, { "epoch": 3.9, "learning_rate": 1.7569830731224978e-10, "logits/chosen": -2.3541338443756104, "logits/rejected": -2.22771954536438, "logps/chosen": -27.374553680419922, "logps/rejected": -2.8792498111724854, "loss": 0.174, "rewards/accuracies": 1.0, "rewards/chosen": 2.415621519088745, "rewards/margins": 1.6601932048797607, "rewards/rejected": 0.7554282546043396, "step": 7224 }, { "epoch": 3.9, "learning_rate": 1.7387394145234047e-10, "logits/chosen": -2.010753870010376, "logits/rejected": -2.245840549468994, "logps/chosen": -0.24643895030021667, "logps/rejected": -0.30372822284698486, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.8654202818870544, "rewards/margins": 0.013375282287597656, "rewards/rejected": 0.8520449995994568, "step": 7225 }, { "epoch": 3.9, "learning_rate": 1.7205908025580972e-10, "logits/chosen": -2.1802866458892822, "logits/rejected": -2.290060043334961, "logps/chosen": -2.996544599533081, "logps/rejected": -2.3069100379943848, "loss": 0.7144, "rewards/accuracies": 0.0, "rewards/chosen": 0.597413182258606, "rewards/margins": -0.04199796915054321, "rewards/rejected": 0.6394111514091492, "step": 7226 }, { "epoch": 3.9, "learning_rate": 1.7025372406885284e-10, "logits/chosen": -2.1132912635803223, "logits/rejected": -2.342747211456299, "logps/chosen": -0.8991180062294006, "logps/rejected": -0.8339006304740906, "loss": 0.6428, "rewards/accuracies": 1.0, "rewards/chosen": 1.2570394277572632, "rewards/margins": 0.10337257385253906, "rewards/rejected": 1.1536668539047241, "step": 7227 }, { "epoch": 3.9, "learning_rate": 1.6845787323585547e-10, "logits/chosen": -2.054264545440674, "logits/rejected": -2.3093955516815186, "logps/chosen": -0.2929147481918335, "logps/rejected": -4.909733772277832, "loss": 0.5552, "rewards/accuracies": 1.0, "rewards/chosen": 0.9488612413406372, "rewards/margins": 0.29792797565460205, "rewards/rejected": 0.6509332656860352, "step": 7228 }, { "epoch": 3.9, "learning_rate": 1.66671528099388e-10, "logits/chosen": -2.0825068950653076, "logits/rejected": -2.073197603225708, "logps/chosen": -3.60197114944458, "logps/rejected": -3.8817176818847656, "loss": 0.4536, "rewards/accuracies": 1.0, "rewards/chosen": 1.1876436471939087, "rewards/margins": 0.5552100539207458, "rewards/rejected": 0.6324335932731628, "step": 7229 }, { "epoch": 3.9, "learning_rate": 1.648946890002112e-10, "logits/chosen": -2.175849437713623, "logits/rejected": -2.3607332706451416, "logps/chosen": -0.5189405679702759, "logps/rejected": -2.688257932662964, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 1.0539029836654663, "rewards/margins": 0.007616519927978516, "rewards/rejected": 1.0462864637374878, "step": 7230 }, { "epoch": 3.9, "learning_rate": 1.631273562772595e-10, "logits/chosen": -2.0184686183929443, "logits/rejected": -2.019331455230713, "logps/chosen": -1.7282633781433105, "logps/rejected": -6.779316425323486, "loss": 0.3709, "rewards/accuracies": 1.0, "rewards/chosen": 1.1763794422149658, "rewards/margins": 0.8006483912467957, "rewards/rejected": 0.37573105096817017, "step": 7231 }, { "epoch": 3.9, "learning_rate": 1.6136953026766875e-10, "logits/chosen": -2.142791271209717, "logits/rejected": -2.123628854751587, "logps/chosen": -1.232332468032837, "logps/rejected": -9.020566940307617, "loss": 0.419, "rewards/accuracies": 1.0, "rewards/chosen": 1.2463223934173584, "rewards/margins": 0.6530917286872864, "rewards/rejected": 0.593230664730072, "step": 7232 }, { "epoch": 3.9, "learning_rate": 1.5962121130675964e-10, "logits/chosen": -2.0053234100341797, "logits/rejected": -1.9823518991470337, "logps/chosen": -13.854022026062012, "logps/rejected": -2.4806957244873047, "loss": 0.22, "rewards/accuracies": 1.0, "rewards/chosen": 2.0895047187805176, "rewards/margins": 1.4021857976913452, "rewards/rejected": 0.6873189210891724, "step": 7233 }, { "epoch": 3.9, "learning_rate": 1.5788239972803208e-10, "logits/chosen": -2.050947427749634, "logits/rejected": -2.3140218257904053, "logps/chosen": -0.15654751658439636, "logps/rejected": -0.18351638317108154, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 1.04595148563385, "rewards/margins": 0.005478262901306152, "rewards/rejected": 1.040473222732544, "step": 7234 }, { "epoch": 3.9, "learning_rate": 1.5615309586317626e-10, "logits/chosen": -2.0732502937316895, "logits/rejected": -2.0850467681884766, "logps/chosen": -5.504220008850098, "logps/rejected": -9.39848804473877, "loss": 0.2805, "rewards/accuracies": 1.0, "rewards/chosen": 1.6962658166885376, "rewards/margins": 1.1278029680252075, "rewards/rejected": 0.5684628486633301, "step": 7235 }, { "epoch": 3.9, "learning_rate": 1.5443330004206722e-10, "logits/chosen": -2.2490453720092773, "logits/rejected": -2.2514240741729736, "logps/chosen": -0.3545616865158081, "logps/rejected": -4.98699951171875, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 0.7413319945335388, "rewards/margins": 0.5634570717811584, "rewards/rejected": 0.17787490785121918, "step": 7236 }, { "epoch": 3.9, "learning_rate": 1.5272301259276476e-10, "logits/chosen": -2.1063923835754395, "logits/rejected": -2.28293514251709, "logps/chosen": -0.16348321735858917, "logps/rejected": -0.20839281380176544, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.8968796730041504, "rewards/margins": 0.04284465312957764, "rewards/rejected": 0.8540350198745728, "step": 7237 }, { "epoch": 3.9, "learning_rate": 1.5102223384152457e-10, "logits/chosen": -2.1011359691619873, "logits/rejected": -2.3049633502960205, "logps/chosen": -1.4181829690933228, "logps/rejected": -1.2915643453598022, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 1.1174050569534302, "rewards/margins": -0.00202023983001709, "rewards/rejected": 1.1194252967834473, "step": 7238 }, { "epoch": 3.9, "learning_rate": 1.4933096411277602e-10, "logits/chosen": -2.147308349609375, "logits/rejected": -2.2394912242889404, "logps/chosen": -0.26060181856155396, "logps/rejected": -0.3514898419380188, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9690033793449402, "rewards/margins": 0.01842665672302246, "rewards/rejected": 0.9505767226219177, "step": 7239 }, { "epoch": 3.91, "learning_rate": 1.4764920372914434e-10, "logits/chosen": -2.12658429145813, "logits/rejected": -2.2477588653564453, "logps/chosen": -4.061788082122803, "logps/rejected": -2.830409049987793, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.6855610609054565, "rewards/margins": 0.028935670852661133, "rewards/rejected": 0.6566253900527954, "step": 7240 }, { "epoch": 3.91, "learning_rate": 1.4597695301143408e-10, "logits/chosen": -2.061070680618286, "logits/rejected": -2.3048593997955322, "logps/chosen": -1.1815822124481201, "logps/rejected": -1.6891579627990723, "loss": 0.7057, "rewards/accuracies": 0.0, "rewards/chosen": 0.8969033360481262, "rewards/margins": -0.02493107318878174, "rewards/rejected": 0.921834409236908, "step": 7241 }, { "epoch": 3.91, "learning_rate": 1.443142122786345e-10, "logits/chosen": -2.0818917751312256, "logits/rejected": -2.0771892070770264, "logps/chosen": -4.724950790405273, "logps/rejected": -4.566576957702637, "loss": 0.4195, "rewards/accuracies": 1.0, "rewards/chosen": 1.0531201362609863, "rewards/margins": 0.6515620946884155, "rewards/rejected": 0.4015580117702484, "step": 7242 }, { "epoch": 3.91, "learning_rate": 1.4266098184792518e-10, "logits/chosen": -2.1923434734344482, "logits/rejected": -2.1925411224365234, "logps/chosen": -3.52699613571167, "logps/rejected": -3.5017054080963135, "loss": 0.3128, "rewards/accuracies": 1.0, "rewards/chosen": 1.5614604949951172, "rewards/margins": 1.0018538236618042, "rewards/rejected": 0.559606671333313, "step": 7243 }, { "epoch": 3.91, "learning_rate": 1.4101726203468167e-10, "logits/chosen": -2.0578291416168213, "logits/rejected": -2.334872245788574, "logps/chosen": -0.5777966380119324, "logps/rejected": -0.5293920040130615, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0990153551101685, "rewards/margins": 0.01858997344970703, "rewards/rejected": 1.0804253816604614, "step": 7244 }, { "epoch": 3.91, "learning_rate": 1.3938305315244203e-10, "logits/chosen": -2.313389778137207, "logits/rejected": -2.157963752746582, "logps/chosen": -26.16931915283203, "logps/rejected": -4.083795070648193, "loss": 0.139, "rewards/accuracies": 1.0, "rewards/chosen": 2.274418592453003, "rewards/margins": 1.9029232263565063, "rewards/rejected": 0.37149539589881897, "step": 7245 }, { "epoch": 3.91, "learning_rate": 1.377583555129458e-10, "logits/chosen": -2.0052826404571533, "logits/rejected": -2.385843276977539, "logps/chosen": -2.80058217048645, "logps/rejected": -2.9924895763397217, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 1.1304223537445068, "rewards/margins": 0.016651034355163574, "rewards/rejected": 1.1137713193893433, "step": 7246 }, { "epoch": 3.91, "learning_rate": 1.361431694261117e-10, "logits/chosen": -1.9856699705123901, "logits/rejected": -2.0025880336761475, "logps/chosen": -4.342829704284668, "logps/rejected": -7.4748334884643555, "loss": 0.5051, "rewards/accuracies": 1.0, "rewards/chosen": 1.1185214519500732, "rewards/margins": 0.4198191165924072, "rewards/rejected": 0.698702335357666, "step": 7247 }, { "epoch": 3.91, "learning_rate": 1.3453749520005998e-10, "logits/chosen": -2.0416147708892822, "logits/rejected": -2.314065933227539, "logps/chosen": -8.265783309936523, "logps/rejected": -8.551910400390625, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.5178969502449036, "rewards/margins": 0.04055720567703247, "rewards/rejected": 0.4773397445678711, "step": 7248 }, { "epoch": 3.91, "learning_rate": 1.3294133314106782e-10, "logits/chosen": -2.0705795288085938, "logits/rejected": -2.0782995223999023, "logps/chosen": -0.8382536768913269, "logps/rejected": -4.656344413757324, "loss": 0.4348, "rewards/accuracies": 1.0, "rewards/chosen": 0.8915160298347473, "rewards/margins": 0.6075016260147095, "rewards/rejected": 0.28401437401771545, "step": 7249 }, { "epoch": 3.91, "learning_rate": 1.3135468355361946e-10, "logits/chosen": -2.1317880153656006, "logits/rejected": -2.244338274002075, "logps/chosen": -7.341688632965088, "logps/rejected": -20.54834747314453, "loss": 0.2366, "rewards/accuracies": 1.0, "rewards/chosen": 2.054764747619629, "rewards/margins": 1.320554494857788, "rewards/rejected": 0.734210193157196, "step": 7250 }, { "epoch": 3.91, "learning_rate": 1.2977754674038389e-10, "logits/chosen": -2.22037410736084, "logits/rejected": -2.229241371154785, "logps/chosen": -0.9587023854255676, "logps/rejected": -3.1187360286712646, "loss": 0.4221, "rewards/accuracies": 1.0, "rewards/chosen": 1.1831703186035156, "rewards/margins": 0.6440011858940125, "rewards/rejected": 0.5391691327095032, "step": 7251 }, { "epoch": 3.91, "learning_rate": 1.282099230022038e-10, "logits/chosen": -2.050478458404541, "logits/rejected": -2.0442745685577393, "logps/chosen": -0.4914671778678894, "logps/rejected": -5.49862003326416, "loss": 0.4357, "rewards/accuracies": 1.0, "rewards/chosen": 1.0881495475769043, "rewards/margins": 0.6050271987915039, "rewards/rejected": 0.4831223487854004, "step": 7252 }, { "epoch": 3.91, "learning_rate": 1.266518126381233e-10, "logits/chosen": -2.004406690597534, "logits/rejected": -1.9813841581344604, "logps/chosen": -11.031868934631348, "logps/rejected": -7.246694564819336, "loss": 0.3114, "rewards/accuracies": 1.0, "rewards/chosen": 1.701212763786316, "rewards/margins": 1.0067925453186035, "rewards/rejected": 0.6944202780723572, "step": 7253 }, { "epoch": 3.91, "learning_rate": 1.2510321594534912e-10, "logits/chosen": -2.009596109390259, "logits/rejected": -2.286306381225586, "logps/chosen": -1.5626343488693237, "logps/rejected": -1.4630450010299683, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.6798969507217407, "rewards/margins": 0.01244974136352539, "rewards/rejected": 0.6674472093582153, "step": 7254 }, { "epoch": 3.91, "learning_rate": 1.235641332192894e-10, "logits/chosen": -2.1389455795288086, "logits/rejected": -2.1697134971618652, "logps/chosen": -9.45252513885498, "logps/rejected": -13.6921968460083, "loss": 0.6494, "rewards/accuracies": 1.0, "rewards/chosen": 1.2102363109588623, "rewards/margins": 0.08949077129364014, "rewards/rejected": 1.1207455396652222, "step": 7255 }, { "epoch": 3.91, "learning_rate": 1.2203456475354258e-10, "logits/chosen": -2.0415492057800293, "logits/rejected": -2.0477142333984375, "logps/chosen": -2.151141881942749, "logps/rejected": -4.747790336608887, "loss": 0.4383, "rewards/accuracies": 1.0, "rewards/chosen": 1.094146966934204, "rewards/margins": 0.5976507663726807, "rewards/rejected": 0.49649620056152344, "step": 7256 }, { "epoch": 3.91, "learning_rate": 1.2051451083988084e-10, "logits/chosen": -1.9558955430984497, "logits/rejected": -1.9549493789672852, "logps/chosen": -0.6208292245864868, "logps/rejected": -1.7978813648223877, "loss": 0.5914, "rewards/accuracies": 1.0, "rewards/chosen": 0.9697750210762024, "rewards/margins": 0.2150489091873169, "rewards/rejected": 0.7547261118888855, "step": 7257 }, { "epoch": 3.91, "learning_rate": 1.190039717682556e-10, "logits/chosen": -2.037137269973755, "logits/rejected": -2.0294623374938965, "logps/chosen": -4.070893287658691, "logps/rejected": -3.68800687789917, "loss": 0.246, "rewards/accuracies": 1.0, "rewards/chosen": 1.657293677330017, "rewards/margins": 1.2767988443374634, "rewards/rejected": 0.3804948031902313, "step": 7258 }, { "epoch": 3.92, "learning_rate": 1.1750294782682524e-10, "logits/chosen": -2.0403528213500977, "logits/rejected": -2.037442922592163, "logps/chosen": -0.572534441947937, "logps/rejected": -3.1526834964752197, "loss": 0.5936, "rewards/accuracies": 1.0, "rewards/chosen": 0.9363462328910828, "rewards/margins": 0.2101982831954956, "rewards/rejected": 0.7261479496955872, "step": 7259 }, { "epoch": 3.92, "learning_rate": 1.160114393019107e-10, "logits/chosen": -1.9563325643539429, "logits/rejected": -1.9545308351516724, "logps/chosen": -1.105468988418579, "logps/rejected": -1.3585879802703857, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.9756860136985779, "rewards/margins": 0.010825276374816895, "rewards/rejected": 0.964860737323761, "step": 7260 }, { "epoch": 3.92, "learning_rate": 1.1452944647802887e-10, "logits/chosen": -1.9489773511886597, "logits/rejected": -2.2804484367370605, "logps/chosen": -0.2030888795852661, "logps/rejected": -0.20336182415485382, "loss": 0.6986, "rewards/accuracies": 0.0, "rewards/chosen": 0.9847709536552429, "rewards/margins": -0.01097416877746582, "rewards/rejected": 0.9957451224327087, "step": 7261 }, { "epoch": 3.92, "learning_rate": 1.1305696963788692e-10, "logits/chosen": -2.1486852169036865, "logits/rejected": -2.1520044803619385, "logps/chosen": -0.794818103313446, "logps/rejected": -2.79158878326416, "loss": 0.5435, "rewards/accuracies": 1.0, "rewards/chosen": 1.0787252187728882, "rewards/margins": 0.32575535774230957, "rewards/rejected": 0.7529698610305786, "step": 7262 }, { "epoch": 3.92, "learning_rate": 1.1159400906236016e-10, "logits/chosen": -2.0143682956695557, "logits/rejected": -2.255084753036499, "logps/chosen": -1.5633907318115234, "logps/rejected": -1.2439216375350952, "loss": 0.7026, "rewards/accuracies": 0.0, "rewards/chosen": 0.7864565253257751, "rewards/margins": -0.018723368644714355, "rewards/rejected": 0.8051798939704895, "step": 7263 }, { "epoch": 3.92, "learning_rate": 1.1014056503051983e-10, "logits/chosen": -2.0909643173217773, "logits/rejected": -2.3489022254943848, "logps/chosen": -0.18516969680786133, "logps/rejected": -0.22615590691566467, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.9139906167984009, "rewards/margins": 0.026201367378234863, "rewards/rejected": 0.887789249420166, "step": 7264 }, { "epoch": 3.92, "learning_rate": 1.086966378196219e-10, "logits/chosen": -2.158384323120117, "logits/rejected": -2.1576850414276123, "logps/chosen": -6.49564790725708, "logps/rejected": -2.3377914428710938, "loss": 0.4481, "rewards/accuracies": 1.0, "rewards/chosen": 1.304180383682251, "rewards/margins": 0.5702536702156067, "rewards/rejected": 0.7339267134666443, "step": 7265 }, { "epoch": 3.92, "learning_rate": 1.0726222770510718e-10, "logits/chosen": -2.0276038646698, "logits/rejected": -2.0357303619384766, "logps/chosen": -1.1838648319244385, "logps/rejected": -4.287482261657715, "loss": 0.4527, "rewards/accuracies": 1.0, "rewards/chosen": 1.0032187700271606, "rewards/margins": 0.5575541257858276, "rewards/rejected": 0.4456646144390106, "step": 7266 }, { "epoch": 3.92, "learning_rate": 1.0583733496059566e-10, "logits/chosen": -2.0699453353881836, "logits/rejected": -2.2563023567199707, "logps/chosen": -0.4496108889579773, "logps/rejected": -1.5059185028076172, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.94677734375, "rewards/margins": 0.042526066303253174, "rewards/rejected": 0.9042512774467468, "step": 7267 }, { "epoch": 3.92, "learning_rate": 1.044219598578977e-10, "logits/chosen": -2.0462682247161865, "logits/rejected": -2.288051128387451, "logps/chosen": -0.2105698585510254, "logps/rejected": -0.21491768956184387, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.9665991067886353, "rewards/margins": 0.0008680224418640137, "rewards/rejected": 0.9657310843467712, "step": 7268 }, { "epoch": 3.92, "learning_rate": 1.0301610266700289e-10, "logits/chosen": -2.0723965167999268, "logits/rejected": -2.2813403606414795, "logps/chosen": -0.2156037986278534, "logps/rejected": -0.17822018265724182, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.8498856425285339, "rewards/margins": 0.020040273666381836, "rewards/rejected": 0.8298453688621521, "step": 7269 }, { "epoch": 3.92, "learning_rate": 1.0161976365609116e-10, "logits/chosen": -2.0379178524017334, "logits/rejected": -2.2484652996063232, "logps/chosen": -0.16871201992034912, "logps/rejected": -0.231028750538826, "loss": 0.7031, "rewards/accuracies": 0.0, "rewards/chosen": 0.8938485383987427, "rewards/margins": -0.01980990171432495, "rewards/rejected": 0.9136584401130676, "step": 7270 }, { "epoch": 3.92, "learning_rate": 1.0023294309151608e-10, "logits/chosen": -1.992720603942871, "logits/rejected": -1.9942338466644287, "logps/chosen": -1.7175287008285522, "logps/rejected": -3.5574722290039062, "loss": 0.4896, "rewards/accuracies": 1.0, "rewards/chosen": 0.9764825105667114, "rewards/margins": 0.4593856930732727, "rewards/rejected": 0.5170968174934387, "step": 7271 }, { "epoch": 3.92, "learning_rate": 9.885564123783275e-11, "logits/chosen": -2.3165066242218018, "logits/rejected": -2.025743007659912, "logps/chosen": -62.442604064941406, "logps/rejected": -13.269637107849121, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 3.7308099269866943, "rewards/margins": 3.043614149093628, "rewards/rejected": 0.6871957182884216, "step": 7272 }, { "epoch": 3.92, "learning_rate": 9.74878583577643e-11, "logits/chosen": -2.0183207988739014, "logits/rejected": -2.0198073387145996, "logps/chosen": -0.5764301419258118, "logps/rejected": -7.836826801300049, "loss": 0.4146, "rewards/accuracies": 1.0, "rewards/chosen": 1.1120316982269287, "rewards/margins": 0.6661084890365601, "rewards/rejected": 0.44592317938804626, "step": 7273 }, { "epoch": 3.92, "learning_rate": 9.612959471222427e-11, "logits/chosen": -2.0224995613098145, "logits/rejected": -2.3019440174102783, "logps/chosen": -0.5946155786514282, "logps/rejected": -0.5583685636520386, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.8956573605537415, "rewards/margins": 0.026670098304748535, "rewards/rejected": 0.8689872622489929, "step": 7274 }, { "epoch": 3.92, "learning_rate": 9.478085056031093e-11, "logits/chosen": -2.1361141204833984, "logits/rejected": -2.1274116039276123, "logps/chosen": -4.579503536224365, "logps/rejected": -7.054091930389404, "loss": 0.3027, "rewards/accuracies": 1.0, "rewards/chosen": 1.5111963748931885, "rewards/margins": 1.0399293899536133, "rewards/rejected": 0.4712669849395752, "step": 7275 }, { "epoch": 3.92, "learning_rate": 9.34416261593074e-11, "logits/chosen": -2.1678013801574707, "logits/rejected": -2.1619129180908203, "logps/chosen": -2.045415163040161, "logps/rejected": -2.930875778198242, "loss": 0.5063, "rewards/accuracies": 1.0, "rewards/chosen": 1.1826800107955933, "rewards/margins": 0.41678309440612793, "rewards/rejected": 0.7658969163894653, "step": 7276 }, { "epoch": 3.93, "learning_rate": 9.211192176468152e-11, "logits/chosen": -2.094910144805908, "logits/rejected": -2.2563400268554688, "logps/chosen": -2.211418867111206, "logps/rejected": -2.5199835300445557, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.8979496955871582, "rewards/margins": 0.020954430103302002, "rewards/rejected": 0.8769952654838562, "step": 7277 }, { "epoch": 3.93, "learning_rate": 9.079173763007486e-11, "logits/chosen": -2.0500805377960205, "logits/rejected": -2.0516395568847656, "logps/chosen": -1.7240993976593018, "logps/rejected": -1.5118049383163452, "loss": 0.3998, "rewards/accuracies": 1.0, "rewards/chosen": 1.6064256429672241, "rewards/margins": 0.7102741003036499, "rewards/rejected": 0.8961515426635742, "step": 7278 }, { "epoch": 3.93, "learning_rate": 8.948107400733595e-11, "logits/chosen": -2.0823893547058105, "logits/rejected": -2.088968515396118, "logps/chosen": -2.5424392223358154, "logps/rejected": -5.977572917938232, "loss": 0.4396, "rewards/accuracies": 1.0, "rewards/chosen": 0.961323082447052, "rewards/margins": 0.5940986275672913, "rewards/rejected": 0.36722445487976074, "step": 7279 }, { "epoch": 3.93, "learning_rate": 8.817993114647038e-11, "logits/chosen": -2.167202949523926, "logits/rejected": -2.334726333618164, "logps/chosen": -8.431442260742188, "logps/rejected": -8.17011547088623, "loss": 0.7069, "rewards/accuracies": 0.0, "rewards/chosen": 0.9922008514404297, "rewards/margins": -0.027253270149230957, "rewards/rejected": 1.0194541215896606, "step": 7280 }, { "epoch": 3.93, "learning_rate": 8.688830929567958e-11, "logits/chosen": -2.069977283477783, "logits/rejected": -2.074904441833496, "logps/chosen": -0.7864342927932739, "logps/rejected": -7.117929458618164, "loss": 0.3581, "rewards/accuracies": 1.0, "rewards/chosen": 1.0672962665557861, "rewards/margins": 0.8423923254013062, "rewards/rejected": 0.22490397095680237, "step": 7281 }, { "epoch": 3.93, "learning_rate": 8.560620870136093e-11, "logits/chosen": -2.1985645294189453, "logits/rejected": -2.2078323364257812, "logps/chosen": -1.2066552639007568, "logps/rejected": -3.944270372390747, "loss": 0.4264, "rewards/accuracies": 1.0, "rewards/chosen": 1.0569599866867065, "rewards/margins": 0.6315038204193115, "rewards/rejected": 0.42545613646507263, "step": 7282 }, { "epoch": 3.93, "learning_rate": 8.433362960806878e-11, "logits/chosen": -2.0721542835235596, "logits/rejected": -2.0726876258850098, "logps/chosen": -0.11704497039318085, "logps/rejected": -7.00570011138916, "loss": 0.4056, "rewards/accuracies": 1.0, "rewards/chosen": 0.9398507475852966, "rewards/margins": 0.6926388144493103, "rewards/rejected": 0.24721193313598633, "step": 7283 }, { "epoch": 3.93, "learning_rate": 8.307057225856451e-11, "logits/chosen": -2.196915626525879, "logits/rejected": -2.211897611618042, "logps/chosen": -1.173609972000122, "logps/rejected": -1.1465578079223633, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 1.0656341314315796, "rewards/margins": 0.013096809387207031, "rewards/rejected": 1.0525373220443726, "step": 7284 }, { "epoch": 3.93, "learning_rate": 8.181703689378317e-11, "logits/chosen": -2.0707781314849854, "logits/rejected": -2.3122775554656982, "logps/chosen": -0.24742268025875092, "logps/rejected": -0.226255863904953, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.8188373446464539, "rewards/margins": 0.01837337017059326, "rewards/rejected": 0.8004639744758606, "step": 7285 }, { "epoch": 3.93, "learning_rate": 8.057302375284459e-11, "logits/chosen": -2.1314244270324707, "logits/rejected": -2.1349434852600098, "logps/chosen": -2.399632453918457, "logps/rejected": -1.3122516870498657, "loss": 0.5677, "rewards/accuracies": 1.0, "rewards/chosen": 1.4540879726409912, "rewards/margins": 0.2689591646194458, "rewards/rejected": 1.1851288080215454, "step": 7286 }, { "epoch": 3.93, "learning_rate": 7.933853307305338e-11, "logits/chosen": -2.0731427669525146, "logits/rejected": -2.282627820968628, "logps/chosen": -1.75319242477417, "logps/rejected": -3.9007043838500977, "loss": 0.6669, "rewards/accuracies": 1.0, "rewards/chosen": 1.044685959815979, "rewards/margins": 0.05323880910873413, "rewards/rejected": 0.9914471507072449, "step": 7287 }, { "epoch": 3.93, "learning_rate": 7.811356508989341e-11, "logits/chosen": -2.0075650215148926, "logits/rejected": -2.0129294395446777, "logps/chosen": -1.6603254079818726, "logps/rejected": -5.064941883087158, "loss": 0.4293, "rewards/accuracies": 1.0, "rewards/chosen": 0.9873211979866028, "rewards/margins": 0.6234028339385986, "rewards/rejected": 0.36391836404800415, "step": 7288 }, { "epoch": 3.93, "learning_rate": 7.689812003703889e-11, "logits/chosen": -2.1010329723358154, "logits/rejected": -2.230994462966919, "logps/chosen": -1.118515133857727, "logps/rejected": -1.153288722038269, "loss": 0.6749, "rewards/accuracies": 1.0, "rewards/chosen": 0.7152210474014282, "rewards/margins": 0.0369076132774353, "rewards/rejected": 0.6783134341239929, "step": 7289 }, { "epoch": 3.93, "learning_rate": 7.569219814634875e-11, "logits/chosen": -2.097468614578247, "logits/rejected": -2.3252768516540527, "logps/chosen": -1.4848979711532593, "logps/rejected": -1.502758502960205, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8686837553977966, "rewards/margins": 0.021289169788360596, "rewards/rejected": 0.847394585609436, "step": 7290 }, { "epoch": 3.93, "learning_rate": 7.449579964785013e-11, "logits/chosen": -2.0665371417999268, "logits/rejected": -2.24716854095459, "logps/chosen": -1.4073272943496704, "logps/rejected": -1.2477765083312988, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.8557478189468384, "rewards/margins": 0.0026838183403015137, "rewards/rejected": 0.8530640006065369, "step": 7291 }, { "epoch": 3.93, "learning_rate": 7.330892476976602e-11, "logits/chosen": -2.1127920150756836, "logits/rejected": -2.1144754886627197, "logps/chosen": -4.1982035636901855, "logps/rejected": -2.8354973793029785, "loss": 0.2295, "rewards/accuracies": 1.0, "rewards/chosen": 1.9106063842773438, "rewards/margins": 1.3546867370605469, "rewards/rejected": 0.5559197068214417, "step": 7292 }, { "epoch": 3.93, "learning_rate": 7.21315737385042e-11, "logits/chosen": -2.0831780433654785, "logits/rejected": -2.0762696266174316, "logps/chosen": -9.220311164855957, "logps/rejected": -1.854364275932312, "loss": 0.6466, "rewards/accuracies": 1.0, "rewards/chosen": 1.180856466293335, "rewards/margins": 0.09538841247558594, "rewards/rejected": 1.085468053817749, "step": 7293 }, { "epoch": 3.93, "learning_rate": 7.096374677865724e-11, "logits/chosen": -2.0717084407806396, "logits/rejected": -2.0719563961029053, "logps/chosen": -0.5668664574623108, "logps/rejected": -7.931092739105225, "loss": 0.3721, "rewards/accuracies": 1.0, "rewards/chosen": 1.1337183713912964, "rewards/margins": 0.7966796159744263, "rewards/rejected": 0.3370387554168701, "step": 7294 }, { "epoch": 3.93, "learning_rate": 6.980544411298584e-11, "logits/chosen": -2.1721694469451904, "logits/rejected": -2.356015920639038, "logps/chosen": -0.15543074905872345, "logps/rejected": -0.16820760071277618, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9535459876060486, "rewards/margins": 0.014372289180755615, "rewards/rejected": 0.939173698425293, "step": 7295 }, { "epoch": 3.94, "learning_rate": 6.865666596244657e-11, "logits/chosen": -2.0742604732513428, "logits/rejected": -2.079862356185913, "logps/chosen": -3.8803870677948, "logps/rejected": -14.075716972351074, "loss": 0.3693, "rewards/accuracies": 1.0, "rewards/chosen": 0.9739848971366882, "rewards/margins": 0.805903434753418, "rewards/rejected": 0.16808147728443146, "step": 7296 }, { "epoch": 3.94, "learning_rate": 6.751741254618081e-11, "logits/chosen": -2.1426126956939697, "logits/rejected": -2.1460654735565186, "logps/chosen": -0.5336620807647705, "logps/rejected": -4.09536075592041, "loss": 0.4815, "rewards/accuracies": 1.0, "rewards/chosen": 1.1477605104446411, "rewards/margins": 0.4805431365966797, "rewards/rejected": 0.6672173738479614, "step": 7297 }, { "epoch": 3.94, "learning_rate": 6.638768408150364e-11, "logits/chosen": -1.947566032409668, "logits/rejected": -1.929101586341858, "logps/chosen": -7.3527445793151855, "logps/rejected": -0.8170971870422363, "loss": 0.551, "rewards/accuracies": 1.0, "rewards/chosen": 1.4084285497665405, "rewards/margins": 0.30795419216156006, "rewards/rejected": 1.1004743576049805, "step": 7298 }, { "epoch": 3.94, "learning_rate": 6.526748078392041e-11, "logits/chosen": -1.9961355924606323, "logits/rejected": -1.9981470108032227, "logps/chosen": -0.16039039194583893, "logps/rejected": -3.116551160812378, "loss": 0.4951, "rewards/accuracies": 1.0, "rewards/chosen": 0.9165277481079102, "rewards/margins": 0.44529008865356445, "rewards/rejected": 0.4712376594543457, "step": 7299 }, { "epoch": 3.94, "learning_rate": 6.415680286712133e-11, "logits/chosen": -2.155980110168457, "logits/rejected": -2.14789080619812, "logps/chosen": -2.6820929050445557, "logps/rejected": -3.2762184143066406, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 1.220036506652832, "rewards/margins": 0.732494592666626, "rewards/rejected": 0.48754188418388367, "step": 7300 }, { "epoch": 3.94, "learning_rate": 6.305565054296468e-11, "logits/chosen": -2.026719093322754, "logits/rejected": -2.0293047428131104, "logps/chosen": -4.497937202453613, "logps/rejected": -6.689647674560547, "loss": 0.3842, "rewards/accuracies": 1.0, "rewards/chosen": 1.302993655204773, "rewards/margins": 0.75838303565979, "rewards/rejected": 0.5446106195449829, "step": 7301 }, { "epoch": 3.94, "learning_rate": 6.196402402151579e-11, "logits/chosen": -1.9845855236053467, "logits/rejected": -2.2989323139190674, "logps/chosen": -1.1028258800506592, "logps/rejected": -4.260116100311279, "loss": 0.6314, "rewards/accuracies": 1.0, "rewards/chosen": 1.0608093738555908, "rewards/margins": 0.1276400089263916, "rewards/rejected": 0.9331693649291992, "step": 7302 }, { "epoch": 3.94, "learning_rate": 6.088192351100253e-11, "logits/chosen": -2.242940902709961, "logits/rejected": -2.209482192993164, "logps/chosen": -22.502796173095703, "logps/rejected": -11.012195587158203, "loss": 0.3329, "rewards/accuracies": 1.0, "rewards/chosen": 2.043522357940674, "rewards/margins": 0.9288331270217896, "rewards/rejected": 1.1146892309188843, "step": 7303 }, { "epoch": 3.94, "learning_rate": 5.980934921783754e-11, "logits/chosen": -2.189340114593506, "logits/rejected": -2.3180055618286133, "logps/chosen": -0.955468475818634, "logps/rejected": -0.9223248958587646, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 1.0860041379928589, "rewards/margins": 0.01385033130645752, "rewards/rejected": 1.0721538066864014, "step": 7304 }, { "epoch": 3.94, "learning_rate": 5.874630134663494e-11, "logits/chosen": -2.081768274307251, "logits/rejected": -2.335954427719116, "logps/chosen": -0.2706032693386078, "logps/rejected": -0.280261754989624, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 1.0801539421081543, "rewards/margins": -0.008535146713256836, "rewards/rejected": 1.0886890888214111, "step": 7305 }, { "epoch": 3.94, "learning_rate": 5.769278010016587e-11, "logits/chosen": -2.1689512729644775, "logits/rejected": -2.3072657585144043, "logps/chosen": -0.27234935760498047, "logps/rejected": -0.26551204919815063, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 1.0754942893981934, "rewards/margins": -0.0021697282791137695, "rewards/rejected": 1.0776640176773071, "step": 7306 }, { "epoch": 3.94, "learning_rate": 5.66487856794029e-11, "logits/chosen": -2.029355764389038, "logits/rejected": -2.0407471656799316, "logps/chosen": -1.714125633239746, "logps/rejected": -2.5420517921447754, "loss": 0.3954, "rewards/accuracies": 1.0, "rewards/chosen": 1.4416089057922363, "rewards/margins": 0.7237688302993774, "rewards/rejected": 0.7178400754928589, "step": 7307 }, { "epoch": 3.94, "learning_rate": 5.561431828349228e-11, "logits/chosen": -2.168586015701294, "logits/rejected": -2.164577007293701, "logps/chosen": -2.7214343547821045, "logps/rejected": -7.633757591247559, "loss": 0.3519, "rewards/accuracies": 1.0, "rewards/chosen": 1.5652918815612793, "rewards/margins": 0.8633769750595093, "rewards/rejected": 0.70191490650177, "step": 7308 }, { "epoch": 3.94, "learning_rate": 5.45893781097706e-11, "logits/chosen": -2.1137330532073975, "logits/rejected": -2.3340184688568115, "logps/chosen": -0.960113525390625, "logps/rejected": -6.51945161819458, "loss": 0.5444, "rewards/accuracies": 1.0, "rewards/chosen": 1.0315340757369995, "rewards/margins": 0.32357150316238403, "rewards/rejected": 0.7079625725746155, "step": 7309 }, { "epoch": 3.94, "learning_rate": 5.357396535374259e-11, "logits/chosen": -2.0904996395111084, "logits/rejected": -2.0927650928497314, "logps/chosen": -2.0455353260040283, "logps/rejected": -2.885524272918701, "loss": 0.588, "rewards/accuracies": 1.0, "rewards/chosen": 1.0620579719543457, "rewards/margins": 0.22264796495437622, "rewards/rejected": 0.8394100069999695, "step": 7310 }, { "epoch": 3.94, "learning_rate": 5.256808020911441e-11, "logits/chosen": -2.0292301177978516, "logits/rejected": -2.0334725379943848, "logps/chosen": -2.5855417251586914, "logps/rejected": -0.6600395441055298, "loss": 0.6229, "rewards/accuracies": 1.0, "rewards/chosen": 1.1593092679977417, "rewards/margins": 0.14580464363098145, "rewards/rejected": 1.0135046243667603, "step": 7311 }, { "epoch": 3.94, "learning_rate": 5.15717228677548e-11, "logits/chosen": -2.108238935470581, "logits/rejected": -2.0977537631988525, "logps/chosen": -1.2716621160507202, "logps/rejected": -12.90495491027832, "loss": 0.2806, "rewards/accuracies": 1.0, "rewards/chosen": 1.4444763660430908, "rewards/margins": 1.1272512674331665, "rewards/rejected": 0.31722506880760193, "step": 7312 }, { "epoch": 3.94, "learning_rate": 5.058489351973949e-11, "logits/chosen": -2.087484121322632, "logits/rejected": -2.2586121559143066, "logps/chosen": -7.31233549118042, "logps/rejected": -1.0435292720794678, "loss": 0.9049, "rewards/accuracies": 0.0, "rewards/chosen": 0.7237264513969421, "rewards/margins": -0.3864261507987976, "rewards/rejected": 1.1101526021957397, "step": 7313 }, { "epoch": 3.94, "learning_rate": 4.960759235330125e-11, "logits/chosen": -2.011624574661255, "logits/rejected": -2.334974527359009, "logps/chosen": -0.26093703508377075, "logps/rejected": -0.28351345658302307, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.8799664378166199, "rewards/margins": 0.012534499168395996, "rewards/rejected": 0.8674319386482239, "step": 7314 }, { "epoch": 3.95, "learning_rate": 4.8639819554868734e-11, "logits/chosen": -2.264392375946045, "logits/rejected": -2.1484875679016113, "logps/chosen": -25.527271270751953, "logps/rejected": -7.989795684814453, "loss": 0.2043, "rewards/accuracies": 1.0, "rewards/chosen": 1.6652764081954956, "rewards/margins": 1.4845162630081177, "rewards/rejected": 0.18076010048389435, "step": 7315 }, { "epoch": 3.95, "learning_rate": 4.7681575309055364e-11, "logits/chosen": -2.231621265411377, "logits/rejected": -2.3367679119110107, "logps/chosen": -0.2087944895029068, "logps/rejected": -0.21538029611110687, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 1.0024092197418213, "rewards/margins": 0.008633434772491455, "rewards/rejected": 0.9937757849693298, "step": 7316 }, { "epoch": 3.95, "learning_rate": 4.673285979865382e-11, "logits/chosen": -2.157224416732788, "logits/rejected": -2.3419439792633057, "logps/chosen": -1.3353842496871948, "logps/rejected": -6.654080867767334, "loss": 0.6453, "rewards/accuracies": 1.0, "rewards/chosen": 1.2497986555099487, "rewards/margins": 0.09810209274291992, "rewards/rejected": 1.1516965627670288, "step": 7317 }, { "epoch": 3.95, "learning_rate": 4.579367320462491e-11, "logits/chosen": -2.1357152462005615, "logits/rejected": -2.1443967819213867, "logps/chosen": -1.4424118995666504, "logps/rejected": -5.680330753326416, "loss": 0.4342, "rewards/accuracies": 1.0, "rewards/chosen": 1.0612133741378784, "rewards/margins": 0.6094191074371338, "rewards/rejected": 0.451794296503067, "step": 7318 }, { "epoch": 3.95, "learning_rate": 4.4864015706141954e-11, "logits/chosen": -2.2502894401550293, "logits/rejected": -2.454202890396118, "logps/chosen": -6.883122444152832, "logps/rejected": -7.202411651611328, "loss": 0.6693, "rewards/accuracies": 1.0, "rewards/chosen": 1.4480499029159546, "rewards/margins": 0.04837167263031006, "rewards/rejected": 1.3996782302856445, "step": 7319 }, { "epoch": 3.95, "learning_rate": 4.3943887480529794e-11, "logits/chosen": -2.0627849102020264, "logits/rejected": -2.2406318187713623, "logps/chosen": -0.3396487832069397, "logps/rejected": -0.3523291349411011, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9975134134292603, "rewards/margins": 0.0201987624168396, "rewards/rejected": 0.9773146510124207, "step": 7320 }, { "epoch": 3.95, "learning_rate": 4.303328870332024e-11, "logits/chosen": -2.1270651817321777, "logits/rejected": -2.211926221847534, "logps/chosen": -1.242603063583374, "logps/rejected": -16.186189651489258, "loss": 0.5973, "rewards/accuracies": 1.0, "rewards/chosen": 1.273910641670227, "rewards/margins": 0.20190799236297607, "rewards/rejected": 1.072002649307251, "step": 7321 }, { "epoch": 3.95, "learning_rate": 4.213221954820212e-11, "logits/chosen": -2.162721633911133, "logits/rejected": -2.169288158416748, "logps/chosen": -4.445677757263184, "logps/rejected": -2.818087577819824, "loss": 0.6613, "rewards/accuracies": 1.0, "rewards/chosen": 1.2978051900863647, "rewards/margins": 0.06466472148895264, "rewards/rejected": 1.233140468597412, "step": 7322 }, { "epoch": 3.95, "learning_rate": 4.1240680187076824e-11, "logits/chosen": -1.9937421083450317, "logits/rejected": -2.307034730911255, "logps/chosen": -0.5174667239189148, "logps/rejected": -0.5546855926513672, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 1.0759185552597046, "rewards/margins": 0.01708054542541504, "rewards/rejected": 1.0588380098342896, "step": 7323 }, { "epoch": 3.95, "learning_rate": 4.035867078999722e-11, "logits/chosen": -2.1157608032226562, "logits/rejected": -2.1307525634765625, "logps/chosen": -3.9949755668640137, "logps/rejected": -3.4880306720733643, "loss": 0.5169, "rewards/accuracies": 1.0, "rewards/chosen": 1.1988849639892578, "rewards/margins": 0.3902796506881714, "rewards/rejected": 0.8086053133010864, "step": 7324 }, { "epoch": 3.95, "learning_rate": 3.9486191525217596e-11, "logits/chosen": -2.0098533630371094, "logits/rejected": -2.2967920303344727, "logps/chosen": -1.1546173095703125, "logps/rejected": -3.987018346786499, "loss": 0.6326, "rewards/accuracies": 1.0, "rewards/chosen": 1.0063402652740479, "rewards/margins": 0.12509214878082275, "rewards/rejected": 0.8812481164932251, "step": 7325 }, { "epoch": 3.95, "learning_rate": 3.8623242559177037e-11, "logits/chosen": -2.0673320293426514, "logits/rejected": -2.0791780948638916, "logps/chosen": -5.0970139503479, "logps/rejected": -2.5289790630340576, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": 1.3896490335464478, "rewards/margins": 0.6513324975967407, "rewards/rejected": 0.738316535949707, "step": 7326 }, { "epoch": 3.95, "learning_rate": 3.7769824056471663e-11, "logits/chosen": -2.0450503826141357, "logits/rejected": -2.2749269008636475, "logps/chosen": -0.42054492235183716, "logps/rejected": -11.111480712890625, "loss": 0.6647, "rewards/accuracies": 1.0, "rewards/chosen": 0.9442763328552246, "rewards/margins": 0.05769127607345581, "rewards/rejected": 0.8865850567817688, "step": 7327 }, { "epoch": 3.95, "learning_rate": 3.692593617991568e-11, "logits/chosen": -2.1289165019989014, "logits/rejected": -2.141552448272705, "logps/chosen": -1.3936381340026855, "logps/rejected": -6.221045017242432, "loss": 0.3078, "rewards/accuracies": 1.0, "rewards/chosen": 1.4365497827529907, "rewards/margins": 1.0203020572662354, "rewards/rejected": 0.416247695684433, "step": 7328 }, { "epoch": 3.95, "learning_rate": 3.6091579090469224e-11, "logits/chosen": -2.1307718753814697, "logits/rejected": -2.0626182556152344, "logps/chosen": -9.641826629638672, "logps/rejected": -7.554235458374023, "loss": 0.6583, "rewards/accuracies": 1.0, "rewards/chosen": 0.7460846304893494, "rewards/margins": 0.07091772556304932, "rewards/rejected": 0.6751669049263, "step": 7329 }, { "epoch": 3.95, "learning_rate": 3.526675294730497e-11, "logits/chosen": -2.125433921813965, "logits/rejected": -2.3033361434936523, "logps/chosen": -4.8735737800598145, "logps/rejected": -4.10950231552124, "loss": 0.7247, "rewards/accuracies": 0.0, "rewards/chosen": 0.8652305603027344, "rewards/margins": -0.062067627906799316, "rewards/rejected": 0.9272981882095337, "step": 7330 }, { "epoch": 3.95, "learning_rate": 3.4451457907758165e-11, "logits/chosen": -2.154103994369507, "logits/rejected": -2.335740327835083, "logps/chosen": -1.1346473693847656, "logps/rejected": -1.2504632472991943, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 1.0281428098678589, "rewards/margins": 0.004609584808349609, "rewards/rejected": 1.0235332250595093, "step": 7331 }, { "epoch": 3.95, "learning_rate": 3.3645694127348853e-11, "logits/chosen": -2.0063846111297607, "logits/rejected": -2.000576972961426, "logps/chosen": -6.151403427124023, "logps/rejected": -3.8526053428649902, "loss": 0.3983, "rewards/accuracies": 1.0, "rewards/chosen": 1.715578317642212, "rewards/margins": 0.714646577835083, "rewards/rejected": 1.000931739807129, "step": 7332 }, { "epoch": 3.96, "learning_rate": 3.284946175978187e-11, "logits/chosen": -2.111602783203125, "logits/rejected": -2.119278907775879, "logps/chosen": -2.27231764793396, "logps/rejected": -7.452023506164551, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": 1.2493566274642944, "rewards/margins": 1.002091646194458, "rewards/rejected": 0.24726496636867523, "step": 7333 }, { "epoch": 3.96, "learning_rate": 3.206276095695237e-11, "logits/chosen": -2.075885057449341, "logits/rejected": -2.322925090789795, "logps/chosen": -0.8911877274513245, "logps/rejected": -5.851846694946289, "loss": 0.6254, "rewards/accuracies": 1.0, "rewards/chosen": 0.898281991481781, "rewards/margins": 0.14043039083480835, "rewards/rejected": 0.7578516006469727, "step": 7334 }, { "epoch": 3.96, "learning_rate": 3.128559186892365e-11, "logits/chosen": -2.095764636993408, "logits/rejected": -2.275851249694824, "logps/chosen": -1.9549580812454224, "logps/rejected": -1.9481908082962036, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 1.0120196342468262, "rewards/margins": -0.001262664794921875, "rewards/rejected": 1.013282299041748, "step": 7335 }, { "epoch": 3.96, "learning_rate": 3.05179546439438e-11, "logits/chosen": -2.029074192047119, "logits/rejected": -2.2959144115448, "logps/chosen": -1.6601706743240356, "logps/rejected": -6.1090989112854, "loss": 0.6259, "rewards/accuracies": 1.0, "rewards/chosen": 0.9724947214126587, "rewards/margins": 0.13928699493408203, "rewards/rejected": 0.8332077264785767, "step": 7336 }, { "epoch": 3.96, "learning_rate": 2.9759849428445674e-11, "logits/chosen": -2.13700532913208, "logits/rejected": -2.3261361122131348, "logps/chosen": -0.1530994176864624, "logps/rejected": -0.21985028684139252, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.9090803265571594, "rewards/margins": 0.03640782833099365, "rewards/rejected": 0.8726724982261658, "step": 7337 }, { "epoch": 3.96, "learning_rate": 2.9011276367041372e-11, "logits/chosen": -1.9964679479599, "logits/rejected": -1.9949259757995605, "logps/chosen": -0.2927197217941284, "logps/rejected": -4.110044002532959, "loss": 0.5072, "rewards/accuracies": 1.0, "rewards/chosen": 1.0021700859069824, "rewards/margins": 0.41466355323791504, "rewards/rejected": 0.5875065326690674, "step": 7338 }, { "epoch": 3.96, "learning_rate": 2.8272235602533335e-11, "logits/chosen": -2.129911184310913, "logits/rejected": -2.114950180053711, "logps/chosen": -13.415046691894531, "logps/rejected": -7.15829610824585, "loss": 0.236, "rewards/accuracies": 1.0, "rewards/chosen": 1.5802478790283203, "rewards/margins": 1.323769211769104, "rewards/rejected": 0.2564786374568939, "step": 7339 }, { "epoch": 3.96, "learning_rate": 2.754272727588658e-11, "logits/chosen": -2.143528461456299, "logits/rejected": -2.360193967819214, "logps/chosen": -0.27355289459228516, "logps/rejected": -0.2711895704269409, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.8537287712097168, "rewards/margins": 0.035264790058135986, "rewards/rejected": 0.8184639811515808, "step": 7340 }, { "epoch": 3.96, "learning_rate": 2.6822751526273114e-11, "logits/chosen": -2.0524473190307617, "logits/rejected": -2.038734197616577, "logps/chosen": -3.329631805419922, "logps/rejected": -3.229593276977539, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": 1.7320137023925781, "rewards/margins": 0.9773485064506531, "rewards/rejected": 0.754665195941925, "step": 7341 }, { "epoch": 3.96, "learning_rate": 2.611230849102197e-11, "logits/chosen": -2.0897984504699707, "logits/rejected": -2.24755597114563, "logps/chosen": -0.8928627371788025, "logps/rejected": -0.7734202146530151, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.8065967559814453, "rewards/margins": 0.033573806285858154, "rewards/rejected": 0.7730229496955872, "step": 7342 }, { "epoch": 3.96, "learning_rate": 2.5411398305663633e-11, "logits/chosen": -1.991376280784607, "logits/rejected": -1.9910087585449219, "logps/chosen": -2.8270184993743896, "logps/rejected": -3.307481050491333, "loss": 0.4115, "rewards/accuracies": 1.0, "rewards/chosen": 1.3406918048858643, "rewards/margins": 0.6751506328582764, "rewards/rejected": 0.6655411720275879, "step": 7343 }, { "epoch": 3.96, "learning_rate": 2.4720021103891153e-11, "logits/chosen": -2.110365390777588, "logits/rejected": -2.3039495944976807, "logps/chosen": -6.116530895233154, "logps/rejected": -6.569226264953613, "loss": 0.648, "rewards/accuracies": 1.0, "rewards/chosen": 1.434834361076355, "rewards/margins": 0.09248161315917969, "rewards/rejected": 1.3423527479171753, "step": 7344 }, { "epoch": 3.96, "learning_rate": 2.4038177017599026e-11, "logits/chosen": -2.0514369010925293, "logits/rejected": -2.3605363368988037, "logps/chosen": -4.521922588348389, "logps/rejected": -5.656064033508301, "loss": 0.6285, "rewards/accuracies": 1.0, "rewards/chosen": 0.7153922915458679, "rewards/margins": 0.13369965553283691, "rewards/rejected": 0.581692636013031, "step": 7345 }, { "epoch": 3.96, "learning_rate": 2.336586617684988e-11, "logits/chosen": -2.0552940368652344, "logits/rejected": -2.0511019229888916, "logps/chosen": -7.947032928466797, "logps/rejected": -1.9655483961105347, "loss": 0.3361, "rewards/accuracies": 1.0, "rewards/chosen": 1.588377833366394, "rewards/margins": 0.9175967574119568, "rewards/rejected": 0.6707810759544373, "step": 7346 }, { "epoch": 3.96, "learning_rate": 2.2703088709891128e-11, "logits/chosen": -2.111459493637085, "logits/rejected": -2.3183882236480713, "logps/chosen": -0.8215621709823608, "logps/rejected": -0.814269483089447, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.7777388095855713, "rewards/margins": 0.015161395072937012, "rewards/rejected": 0.7625774145126343, "step": 7347 }, { "epoch": 3.96, "learning_rate": 2.2049844743149416e-11, "logits/chosen": -2.06376051902771, "logits/rejected": -2.2973265647888184, "logps/chosen": -0.08575907349586487, "logps/rejected": -0.09608293324708939, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.9772645235061646, "rewards/margins": 0.022694289684295654, "rewards/rejected": 0.9545702338218689, "step": 7348 }, { "epoch": 3.96, "learning_rate": 2.1406134401241726e-11, "logits/chosen": -2.078167676925659, "logits/rejected": -2.2570767402648926, "logps/chosen": -0.3352143168449402, "logps/rejected": -0.29192546010017395, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.8676291704177856, "rewards/margins": 0.018278419971466064, "rewards/rejected": 0.8493507504463196, "step": 7349 }, { "epoch": 3.96, "learning_rate": 2.077195780695318e-11, "logits/chosen": -2.0745599269866943, "logits/rejected": -2.147839307785034, "logps/chosen": -1.5443949699401855, "logps/rejected": -18.56917953491211, "loss": 0.5496, "rewards/accuracies": 1.0, "rewards/chosen": 1.4251056909561157, "rewards/margins": 0.31112444400787354, "rewards/rejected": 1.1139812469482422, "step": 7350 }, { "epoch": 3.96, "learning_rate": 2.014731508125922e-11, "logits/chosen": -2.071232557296753, "logits/rejected": -2.325740098953247, "logps/chosen": -1.4800724983215332, "logps/rejected": -1.4598400592803955, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.955108642578125, "rewards/margins": 0.03334075212478638, "rewards/rejected": 0.9217678904533386, "step": 7351 }, { "epoch": 3.97, "learning_rate": 1.953220634332009e-11, "logits/chosen": -2.2140018939971924, "logits/rejected": -2.225942611694336, "logps/chosen": -1.4465686082839966, "logps/rejected": -4.530745029449463, "loss": 0.3973, "rewards/accuracies": 1.0, "rewards/chosen": 1.3556593656539917, "rewards/margins": 0.71798175573349, "rewards/rejected": 0.6376776099205017, "step": 7352 }, { "epoch": 3.97, "learning_rate": 1.8926631710464158e-11, "logits/chosen": -2.0865426063537598, "logits/rejected": -2.3366618156433105, "logps/chosen": -12.516108512878418, "logps/rejected": -14.83060073852539, "loss": 0.5722, "rewards/accuracies": 1.0, "rewards/chosen": 1.0483702421188354, "rewards/margins": 0.2585829496383667, "rewards/rejected": 0.7897872924804688, "step": 7353 }, { "epoch": 3.97, "learning_rate": 1.833059129821568e-11, "logits/chosen": -2.0425703525543213, "logits/rejected": -2.0344038009643555, "logps/chosen": -1.919392466545105, "logps/rejected": -6.273191928863525, "loss": 0.3082, "rewards/accuracies": 1.0, "rewards/chosen": 1.3365817070007324, "rewards/margins": 1.0190261602401733, "rewards/rejected": 0.31755557656288147, "step": 7354 }, { "epoch": 3.97, "learning_rate": 1.7744085220267047e-11, "logits/chosen": -2.054452657699585, "logits/rejected": -2.3600869178771973, "logps/chosen": -3.7765774726867676, "logps/rejected": -6.855442523956299, "loss": 0.6153, "rewards/accuracies": 1.0, "rewards/chosen": 1.2308813333511353, "rewards/margins": 0.1622234582901001, "rewards/rejected": 1.0686578750610352, "step": 7355 }, { "epoch": 3.97, "learning_rate": 1.716711358850098e-11, "logits/chosen": -2.1246001720428467, "logits/rejected": -1.9886153936386108, "logps/chosen": -16.012218475341797, "logps/rejected": -8.094816207885742, "loss": 0.2856, "rewards/accuracies": 1.0, "rewards/chosen": 1.9660828113555908, "rewards/margins": 1.10695481300354, "rewards/rejected": 0.8591279983520508, "step": 7356 }, { "epoch": 3.97, "learning_rate": 1.6599676512979443e-11, "logits/chosen": -2.214639186859131, "logits/rejected": -2.201439142227173, "logps/chosen": -9.459988594055176, "logps/rejected": -2.0289106369018555, "loss": 0.4552, "rewards/accuracies": 1.0, "rewards/chosen": 1.6409645080566406, "rewards/margins": 0.5508550405502319, "rewards/rejected": 1.0901094675064087, "step": 7357 }, { "epoch": 3.97, "learning_rate": 1.604177410194363e-11, "logits/chosen": -2.133620500564575, "logits/rejected": -2.0141618251800537, "logps/chosen": -13.30015754699707, "logps/rejected": -10.63247013092041, "loss": 0.3092, "rewards/accuracies": 1.0, "rewards/chosen": 1.9128602743148804, "rewards/margins": 1.015233039855957, "rewards/rejected": 0.8976271748542786, "step": 7358 }, { "epoch": 3.97, "learning_rate": 1.549340646181396e-11, "logits/chosen": -2.2071921825408936, "logits/rejected": -2.0998549461364746, "logps/chosen": -22.325916290283203, "logps/rejected": -3.968325614929199, "loss": 0.1127, "rewards/accuracies": 1.0, "rewards/chosen": 2.548724889755249, "rewards/margins": 2.1261842250823975, "rewards/rejected": 0.4225405752658844, "step": 7359 }, { "epoch": 3.97, "learning_rate": 1.4954573697206764e-11, "logits/chosen": -2.0017154216766357, "logits/rejected": -1.9995412826538086, "logps/chosen": -0.6207669973373413, "logps/rejected": -7.321352481842041, "loss": 0.6174, "rewards/accuracies": 1.0, "rewards/chosen": 0.9186844229698181, "rewards/margins": 0.15772825479507446, "rewards/rejected": 0.7609561681747437, "step": 7360 }, { "epoch": 3.97, "learning_rate": 1.4425275910889822e-11, "logits/chosen": -2.2022883892059326, "logits/rejected": -2.2528371810913086, "logps/chosen": -0.3284015953540802, "logps/rejected": -0.3584863245487213, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.8771986365318298, "rewards/margins": 0.005338132381439209, "rewards/rejected": 0.8718605041503906, "step": 7361 }, { "epoch": 3.97, "learning_rate": 1.3905513203849028e-11, "logits/chosen": -2.033769369125366, "logits/rejected": -2.0472540855407715, "logps/chosen": -1.247413158416748, "logps/rejected": -7.076739311218262, "loss": 0.4209, "rewards/accuracies": 1.0, "rewards/chosen": 1.150687575340271, "rewards/margins": 0.6474859118461609, "rewards/rejected": 0.5032016634941101, "step": 7362 }, { "epoch": 3.97, "learning_rate": 1.3395285675216195e-11, "logits/chosen": -2.122070074081421, "logits/rejected": -2.118638038635254, "logps/chosen": -0.7985627055168152, "logps/rejected": -3.3676517009735107, "loss": 0.4651, "rewards/accuracies": 1.0, "rewards/chosen": 1.1883219480514526, "rewards/margins": 0.5240077376365662, "rewards/rejected": 0.6643142104148865, "step": 7363 }, { "epoch": 3.97, "learning_rate": 1.2894593422335675e-11, "logits/chosen": -2.068351984024048, "logits/rejected": -2.3184499740600586, "logps/chosen": -4.335740089416504, "logps/rejected": -4.385605335235596, "loss": 0.6648, "rewards/accuracies": 1.0, "rewards/chosen": 0.7067072987556458, "rewards/margins": 0.05756199359893799, "rewards/rejected": 0.6491453051567078, "step": 7364 }, { "epoch": 3.97, "learning_rate": 1.2403436540703305e-11, "logits/chosen": -2.2064156532287598, "logits/rejected": -2.315542459487915, "logps/chosen": -0.6970962285995483, "logps/rejected": -0.735927402973175, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.9297884106636047, "rewards/margins": 0.017571568489074707, "rewards/rejected": 0.91221684217453, "step": 7365 }, { "epoch": 3.97, "learning_rate": 1.1921815124021905e-11, "logits/chosen": -2.0394091606140137, "logits/rejected": -2.2740440368652344, "logps/chosen": -8.834171295166016, "logps/rejected": -5.471978664398193, "loss": 0.771, "rewards/accuracies": 0.0, "rewards/chosen": 0.8867576718330383, "rewards/margins": -0.15009349584579468, "rewards/rejected": 1.036851167678833, "step": 7366 }, { "epoch": 3.97, "learning_rate": 1.1449729264156881e-11, "logits/chosen": -2.12746000289917, "logits/rejected": -2.155416250228882, "logps/chosen": -2.6865105628967285, "logps/rejected": -13.433433532714844, "loss": 0.2419, "rewards/accuracies": 1.0, "rewards/chosen": 1.6072202920913696, "rewards/margins": 1.2959306240081787, "rewards/rejected": 0.31128960847854614, "step": 7367 }, { "epoch": 3.97, "learning_rate": 1.0987179051169526e-11, "logits/chosen": -2.171908140182495, "logits/rejected": -2.1701531410217285, "logps/chosen": -19.821819305419922, "logps/rejected": -8.100213050842285, "loss": 0.2565, "rewards/accuracies": 1.0, "rewards/chosen": 1.7103573083877563, "rewards/margins": 1.2296473979949951, "rewards/rejected": 0.48070985078811646, "step": 7368 }, { "epoch": 3.97, "learning_rate": 1.053416457328371e-11, "logits/chosen": -2.0645599365234375, "logits/rejected": -2.0572774410247803, "logps/chosen": -3.7010583877563477, "logps/rejected": -3.3992373943328857, "loss": 0.3977, "rewards/accuracies": 1.0, "rewards/chosen": 1.8404206037521362, "rewards/margins": 0.7166633605957031, "rewards/rejected": 1.123757243156433, "step": 7369 }, { "epoch": 3.98, "learning_rate": 1.0090685916924746e-11, "logits/chosen": -2.1735966205596924, "logits/rejected": -2.174050807952881, "logps/chosen": -0.21910324692726135, "logps/rejected": -6.407172679901123, "loss": 0.4162, "rewards/accuracies": 1.0, "rewards/chosen": 0.884623646736145, "rewards/margins": 0.6611614227294922, "rewards/rejected": 0.22346225380897522, "step": 7370 }, { "epoch": 3.98, "learning_rate": 9.656743166680526e-12, "logits/chosen": -2.0504424571990967, "logits/rejected": -2.060839891433716, "logps/chosen": -1.9846627712249756, "logps/rejected": -2.6492743492126465, "loss": 0.4356, "rewards/accuracies": 1.0, "rewards/chosen": 1.268964409828186, "rewards/margins": 0.6053586602210999, "rewards/rejected": 0.6636057496070862, "step": 7371 }, { "epoch": 3.98, "learning_rate": 9.232336405334828e-12, "logits/chosen": -2.1456997394561768, "logits/rejected": -2.1468875408172607, "logps/chosen": -0.5387425422668457, "logps/rejected": -8.891739845275879, "loss": 0.3796, "rewards/accuracies": 1.0, "rewards/chosen": 0.9571670889854431, "rewards/margins": 0.772904634475708, "rewards/rejected": 0.1842624694108963, "step": 7372 }, { "epoch": 3.98, "learning_rate": 8.817465713839568e-12, "logits/chosen": -2.1722261905670166, "logits/rejected": -2.3862152099609375, "logps/chosen": -0.5031424164772034, "logps/rejected": -0.5469749569892883, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9670912623405457, "rewards/margins": 0.02159935235977173, "rewards/rejected": 0.9454919099807739, "step": 7373 }, { "epoch": 3.98, "learning_rate": 8.412131171348091e-12, "logits/chosen": -2.1446621417999268, "logits/rejected": -1.9606716632843018, "logps/chosen": -33.90470886230469, "logps/rejected": -2.9903228282928467, "loss": 0.1724, "rewards/accuracies": 1.0, "rewards/chosen": 2.3326733112335205, "rewards/margins": 1.6704096794128418, "rewards/rejected": 0.6622636914253235, "step": 7374 }, { "epoch": 3.98, "learning_rate": 8.01633285516523e-12, "logits/chosen": -2.121990203857422, "logits/rejected": -2.121039628982544, "logps/chosen": -0.4798082411289215, "logps/rejected": -1.7195172309875488, "loss": 0.5949, "rewards/accuracies": 1.0, "rewards/chosen": 1.0237818956375122, "rewards/margins": 0.20722496509552002, "rewards/rejected": 0.8165569305419922, "step": 7375 }, { "epoch": 3.98, "learning_rate": 7.630070840797253e-12, "logits/chosen": -2.0224251747131348, "logits/rejected": -2.2740933895111084, "logps/chosen": -3.145656108856201, "logps/rejected": -2.626858711242676, "loss": 0.7003, "rewards/accuracies": 0.0, "rewards/chosen": 0.6127820611000061, "rewards/margins": -0.014178872108459473, "rewards/rejected": 0.6269609332084656, "step": 7376 }, { "epoch": 3.98, "learning_rate": 7.253345201924111e-12, "logits/chosen": -2.193192958831787, "logits/rejected": -2.335345506668091, "logps/chosen": -6.893474578857422, "logps/rejected": -6.661505699157715, "loss": 0.7029, "rewards/accuracies": 0.0, "rewards/chosen": 0.5130629539489746, "rewards/margins": -0.019405066967010498, "rewards/rejected": 0.5324680209159851, "step": 7377 }, { "epoch": 3.98, "learning_rate": 6.8861560104160884e-12, "logits/chosen": -2.0054574012756348, "logits/rejected": -2.0037150382995605, "logps/chosen": -2.4541914463043213, "logps/rejected": -4.098917007446289, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 1.2286571264266968, "rewards/margins": 0.7617889046669006, "rewards/rejected": 0.46686822175979614, "step": 7378 }, { "epoch": 3.98, "learning_rate": 6.528503336311608e-12, "logits/chosen": -2.0479016304016113, "logits/rejected": -2.0459883213043213, "logps/chosen": -0.39225825667381287, "logps/rejected": -2.0053555965423584, "loss": 0.5675, "rewards/accuracies": 1.0, "rewards/chosen": 1.0340265035629272, "rewards/margins": 0.2693362832069397, "rewards/rejected": 0.7646902203559875, "step": 7379 }, { "epoch": 3.98, "learning_rate": 6.180387247839425e-12, "logits/chosen": -2.0155889987945557, "logits/rejected": -2.016904592514038, "logps/chosen": -0.13612811267375946, "logps/rejected": -6.856433391571045, "loss": 0.4371, "rewards/accuracies": 1.0, "rewards/chosen": 0.9010304808616638, "rewards/margins": 0.6011472940444946, "rewards/rejected": 0.2998832166194916, "step": 7380 }, { "epoch": 3.98, "learning_rate": 5.841807811396426e-12, "logits/chosen": -2.096897602081299, "logits/rejected": -2.099844217300415, "logps/chosen": -0.28106746077537537, "logps/rejected": -5.202974796295166, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 1.0912171602249146, "rewards/margins": 0.6483749151229858, "rewards/rejected": 0.4428422451019287, "step": 7381 }, { "epoch": 3.98, "learning_rate": 5.512765091575389e-12, "logits/chosen": -2.2586519718170166, "logits/rejected": -2.051131248474121, "logps/chosen": -35.06773376464844, "logps/rejected": -4.870511531829834, "loss": 0.0734, "rewards/accuracies": 1.0, "rewards/chosen": 3.011852264404297, "rewards/margins": 2.5745270252227783, "rewards/rejected": 0.4373251497745514, "step": 7382 }, { "epoch": 3.98, "learning_rate": 5.1932591511427706e-12, "logits/chosen": -2.0444908142089844, "logits/rejected": -2.2634854316711426, "logps/chosen": -9.901285171508789, "logps/rejected": -8.50031566619873, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.5529794096946716, "rewards/margins": 0.02984219789505005, "rewards/rejected": 0.5231372117996216, "step": 7383 }, { "epoch": 3.98, "learning_rate": 4.883290051049815e-12, "logits/chosen": -2.107762575149536, "logits/rejected": -2.275062084197998, "logps/chosen": -4.497285842895508, "logps/rejected": -4.432743072509766, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.7566726803779602, "rewards/margins": 0.018195629119873047, "rewards/rejected": 0.7384770512580872, "step": 7384 }, { "epoch": 3.98, "learning_rate": 4.5828578504159e-12, "logits/chosen": -2.0617616176605225, "logits/rejected": -2.0407159328460693, "logps/chosen": -12.333930969238281, "logps/rejected": -5.14506196975708, "loss": 0.3692, "rewards/accuracies": 1.0, "rewards/chosen": 1.8935775756835938, "rewards/margins": 0.8062191009521484, "rewards/rejected": 1.0873584747314453, "step": 7385 }, { "epoch": 3.98, "learning_rate": 4.291962606556288e-12, "logits/chosen": -2.066798210144043, "logits/rejected": -2.069340944290161, "logps/chosen": -4.191924095153809, "logps/rejected": -3.3110947608947754, "loss": 0.1287, "rewards/accuracies": 1.0, "rewards/chosen": 2.4734458923339844, "rewards/margins": 1.985461711883545, "rewards/rejected": 0.48798415064811707, "step": 7386 }, { "epoch": 3.98, "learning_rate": 4.010604374959925e-12, "logits/chosen": -2.0595505237579346, "logits/rejected": -2.0467467308044434, "logps/chosen": -2.756484031677246, "logps/rejected": -2.2561402320861816, "loss": 0.3399, "rewards/accuracies": 1.0, "rewards/chosen": 1.7351264953613281, "rewards/margins": 0.9044738411903381, "rewards/rejected": 0.83065265417099, "step": 7387 }, { "epoch": 3.98, "learning_rate": 3.738783209300544e-12, "logits/chosen": -2.2777981758117676, "logits/rejected": -2.1709489822387695, "logps/chosen": -24.063560485839844, "logps/rejected": -4.265956878662109, "loss": 0.1618, "rewards/accuracies": 1.0, "rewards/chosen": 2.1762337684631348, "rewards/margins": 1.7395819425582886, "rewards/rejected": 0.4366517961025238, "step": 7388 }, { "epoch": 3.99, "learning_rate": 3.4764991614255614e-12, "logits/chosen": -2.015263795852661, "logits/rejected": -2.022534132003784, "logps/chosen": -2.398155450820923, "logps/rejected": -1.6683804988861084, "loss": 0.5476, "rewards/accuracies": 1.0, "rewards/chosen": 1.1621179580688477, "rewards/margins": 0.31600475311279297, "rewards/rejected": 0.8461132049560547, "step": 7389 }, { "epoch": 3.99, "learning_rate": 3.223752281372727e-12, "logits/chosen": -2.1062421798706055, "logits/rejected": -2.1117899417877197, "logps/chosen": -2.0418035984039307, "logps/rejected": -13.031444549560547, "loss": 0.5153, "rewards/accuracies": 1.0, "rewards/chosen": 1.3361842632293701, "rewards/margins": 0.3942420482635498, "rewards/rejected": 0.9419422149658203, "step": 7390 }, { "epoch": 3.99, "learning_rate": 2.9805426173479255e-12, "logits/chosen": -2.0369017124176025, "logits/rejected": -2.042724370956421, "logps/chosen": -0.8627070188522339, "logps/rejected": -4.620057106018066, "loss": 0.3918, "rewards/accuracies": 1.0, "rewards/chosen": 0.992989718914032, "rewards/margins": 0.7347512245178223, "rewards/rejected": 0.2582385241985321, "step": 7391 }, { "epoch": 3.99, "learning_rate": 2.746870215752928e-12, "logits/chosen": -2.1986403465270996, "logits/rejected": -2.326458215713501, "logps/chosen": -0.3720126450061798, "logps/rejected": -0.37068700790405273, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 1.04972505569458, "rewards/margins": 0.020034313201904297, "rewards/rejected": 1.0296907424926758, "step": 7392 }, { "epoch": 3.99, "learning_rate": 2.5227351211576376e-12, "logits/chosen": -2.1536777019500732, "logits/rejected": -2.15385103225708, "logps/chosen": -1.9802998304367065, "logps/rejected": -4.468790054321289, "loss": 0.2642, "rewards/accuracies": 1.0, "rewards/chosen": 1.5884145498275757, "rewards/margins": 1.1960506439208984, "rewards/rejected": 0.39236384630203247, "step": 7393 }, { "epoch": 3.99, "learning_rate": 2.3081373763167435e-12, "logits/chosen": -2.055516004562378, "logits/rejected": -2.340799331665039, "logps/chosen": -1.1149365901947021, "logps/rejected": -1.0555579662322998, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.9591760039329529, "rewards/margins": 0.021899044513702393, "rewards/rejected": 0.9372769594192505, "step": 7394 }, { "epoch": 3.99, "learning_rate": 2.1030770221641682e-12, "logits/chosen": -2.1129019260406494, "logits/rejected": -2.1200597286224365, "logps/chosen": -3.0783443450927734, "logps/rejected": -5.059161186218262, "loss": 0.3974, "rewards/accuracies": 1.0, "rewards/chosen": 1.3697751760482788, "rewards/margins": 0.7176834940910339, "rewards/rejected": 0.6520916819572449, "step": 7395 }, { "epoch": 3.99, "learning_rate": 1.907554097824171e-12, "logits/chosen": -2.002411127090454, "logits/rejected": -2.0113611221313477, "logps/chosen": -2.5475573539733887, "logps/rejected": -3.088233470916748, "loss": 0.4094, "rewards/accuracies": 1.0, "rewards/chosen": 1.2711549997329712, "rewards/margins": 0.681437075138092, "rewards/rejected": 0.5897179245948792, "step": 7396 }, { "epoch": 3.99, "learning_rate": 1.7215686405891439e-12, "logits/chosen": -2.007660150527954, "logits/rejected": -2.2768611907958984, "logps/chosen": -0.6364229321479797, "logps/rejected": -0.5623350143432617, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9878837466239929, "rewards/margins": 0.04944199323654175, "rewards/rejected": 0.9384417533874512, "step": 7397 }, { "epoch": 3.99, "learning_rate": 1.5451206859418142e-12, "logits/chosen": -1.9984967708587646, "logits/rejected": -2.2682764530181885, "logps/chosen": -0.33522462844848633, "logps/rejected": -0.3013833165168762, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.9113615155220032, "rewards/margins": 0.006043076515197754, "rewards/rejected": 0.9053184390068054, "step": 7398 }, { "epoch": 3.99, "learning_rate": 1.3782102675274909e-12, "logits/chosen": -2.0468928813934326, "logits/rejected": -2.321457624435425, "logps/chosen": -0.593029260635376, "logps/rejected": -0.5779017210006714, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.8960954546928406, "rewards/margins": 0.035383760929107666, "rewards/rejected": 0.8607116937637329, "step": 7399 }, { "epoch": 3.99, "learning_rate": 1.2208374172040237e-12, "logits/chosen": -2.1571686267852783, "logits/rejected": -2.1566309928894043, "logps/chosen": -0.5325483083724976, "logps/rejected": -2.6662495136260986, "loss": 0.6378, "rewards/accuracies": 1.0, "rewards/chosen": 0.9491912722587585, "rewards/margins": 0.1138426661491394, "rewards/rejected": 0.8353486061096191, "step": 7400 }, { "epoch": 3.99, "learning_rate": 1.0730021649751898e-12, "logits/chosen": -2.1919057369232178, "logits/rejected": -2.189439058303833, "logps/chosen": -3.2988479137420654, "logps/rejected": -12.612401962280273, "loss": 0.3182, "rewards/accuracies": 1.0, "rewards/chosen": 1.268154501914978, "rewards/margins": 0.9817033410072327, "rewards/rejected": 0.28645116090774536, "step": 7401 }, { "epoch": 3.99, "learning_rate": 9.347045390517561e-13, "logits/chosen": -2.075138807296753, "logits/rejected": -2.254096269607544, "logps/chosen": -0.444797158241272, "logps/rejected": -0.40909337997436523, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9371557235717773, "rewards/margins": 0.02243250608444214, "rewards/rejected": 0.9147232174873352, "step": 7402 }, { "epoch": 3.99, "learning_rate": 8.059445658070707e-13, "logits/chosen": -2.076500177383423, "logits/rejected": -2.0828473567962646, "logps/chosen": -2.9666500091552734, "logps/rejected": -6.120027542114258, "loss": 0.3616, "rewards/accuracies": 1.0, "rewards/chosen": 1.698830008506775, "rewards/margins": 0.8309617638587952, "rewards/rejected": 0.8678682446479797, "step": 7403 }, { "epoch": 3.99, "learning_rate": 6.867222698103691e-13, "logits/chosen": -2.142068862915039, "logits/rejected": -2.313127040863037, "logps/chosen": -3.2081682682037354, "logps/rejected": -1.1702640056610107, "loss": 0.6405, "rewards/accuracies": 1.0, "rewards/chosen": 0.9924623370170593, "rewards/margins": 0.10821032524108887, "rewards/rejected": 0.8842520117759705, "step": 7404 }, { "epoch": 3.99, "learning_rate": 5.770376738045701e-13, "logits/chosen": -2.077983856201172, "logits/rejected": -2.075989246368408, "logps/chosen": -0.38643479347229004, "logps/rejected": -4.60651969909668, "loss": 0.4647, "rewards/accuracies": 1.0, "rewards/chosen": 1.0071161985397339, "rewards/margins": 0.5249419212341309, "rewards/rejected": 0.4821743071079254, "step": 7405 }, { "epoch": 3.99, "learning_rate": 4.768907987062753e-13, "logits/chosen": -2.1528000831604004, "logits/rejected": -2.1579325199127197, "logps/chosen": -1.1323771476745605, "logps/rejected": -5.36418342590332, "loss": 0.3887, "rewards/accuracies": 1.0, "rewards/chosen": 1.1202882528305054, "rewards/margins": 0.7442400455474854, "rewards/rejected": 0.37604817748069763, "step": 7406 }, { "epoch": 4.0, "learning_rate": 3.862816636224231e-13, "logits/chosen": -2.089419364929199, "logits/rejected": -2.0966074466705322, "logps/chosen": -2.178467035293579, "logps/rejected": -5.108529567718506, "loss": 0.3773, "rewards/accuracies": 1.0, "rewards/chosen": 1.2636946439743042, "rewards/margins": 0.7802713513374329, "rewards/rejected": 0.48342329263687134, "step": 7407 }, { "epoch": 4.0, "learning_rate": 3.0521028583363474e-13, "logits/chosen": -2.0695886611938477, "logits/rejected": -2.076974868774414, "logps/chosen": -1.8297775983810425, "logps/rejected": -4.047774791717529, "loss": 0.3162, "rewards/accuracies": 1.0, "rewards/chosen": 1.5029553174972534, "rewards/margins": 0.9890063405036926, "rewards/rejected": 0.5139489769935608, "step": 7408 }, { "epoch": 4.0, "learning_rate": 2.336766808108681e-13, "logits/chosen": -2.0583276748657227, "logits/rejected": -2.3131189346313477, "logps/chosen": -0.25375115871429443, "logps/rejected": -0.27783021330833435, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.8693665862083435, "rewards/margins": -0.006825685501098633, "rewards/rejected": 0.8761922717094421, "step": 7409 }, { "epoch": 4.0, "learning_rate": 1.716808621987642e-13, "logits/chosen": -2.2393882274627686, "logits/rejected": -2.372124195098877, "logps/chosen": -12.707411766052246, "logps/rejected": -8.536186218261719, "loss": 0.6592, "rewards/accuracies": 1.0, "rewards/chosen": 1.0778001546859741, "rewards/margins": 0.06907892227172852, "rewards/rejected": 1.0087212324142456, "step": 7410 }, { "epoch": 4.0, "learning_rate": 1.1922284181564712e-13, "logits/chosen": -2.207115888595581, "logits/rejected": -2.368659734725952, "logps/chosen": -2.247404098510742, "logps/rejected": -2.227222442626953, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9750878214836121, "rewards/margins": -0.00014650821685791016, "rewards/rejected": 0.97523432970047, "step": 7411 }, { "epoch": 4.0, "learning_rate": 7.630262968127965e-14, "logits/chosen": -2.102450132369995, "logits/rejected": -2.340350389480591, "logps/chosen": -0.4432161748409271, "logps/rejected": -0.5802180171012878, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8571262359619141, "rewards/margins": 0.019231140613555908, "rewards/rejected": 0.8378950953483582, "step": 7412 }, { "epoch": 4.0, "learning_rate": 4.292023397245437e-14, "logits/chosen": -1.9637553691864014, "logits/rejected": -2.2781667709350586, "logps/chosen": -0.23100446164608002, "logps/rejected": -0.28188157081604004, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.9196242690086365, "rewards/margins": 0.011614739894866943, "rewards/rejected": 0.9080095291137695, "step": 7413 }, { "epoch": 4.0, "learning_rate": 1.907566105630032e-14, "logits/chosen": -2.0616793632507324, "logits/rejected": -2.091718912124634, "logps/chosen": -3.735447645187378, "logps/rejected": -3.785083770751953, "loss": 0.5834, "rewards/accuracies": 1.0, "rewards/chosen": 1.2179516553878784, "rewards/margins": 0.23303669691085815, "rewards/rejected": 0.9849149584770203, "step": 7414 }, { "epoch": 4.0, "learning_rate": 4.768915490283021e-15, "logits/chosen": -2.152707576751709, "logits/rejected": -2.1539154052734375, "logps/chosen": -1.9977424144744873, "logps/rejected": -2.4535388946533203, "loss": 0.6001, "rewards/accuracies": 1.0, "rewards/chosen": 0.9037501215934753, "rewards/margins": 0.19563108682632446, "rewards/rejected": 0.7081190347671509, "step": 7415 }, { "epoch": 4.0, "learning_rate": 0.0, "logits/chosen": -2.0857491493225098, "logits/rejected": -2.081618309020996, "logps/chosen": -3.377639055252075, "logps/rejected": -3.2186942100524902, "loss": 0.554, "rewards/accuracies": 1.0, "rewards/chosen": 1.1095633506774902, "rewards/margins": 0.3008660078048706, "rewards/rejected": 0.8086973428726196, "step": 7416 }, { "epoch": 4.0, "step": 7416, "total_flos": 0.0, "train_loss": 0.5493184025786182, "train_runtime": 42990.6551, "train_samples_per_second": 0.173, "train_steps_per_second": 0.173 } ], "logging_steps": 1.0, "max_steps": 7416, "num_train_epochs": 4, "save_steps": 200, "total_flos": 0.0, "trial_name": null, "trial_params": null }